From 249af3aa58a2340c58c357c356ac72034783de08 Mon Sep 17 00:00:00 2001 From: minhsphuc12 Date: Sat, 27 Jul 2019 23:08:15 +0700 Subject: [PATCH 1/6] add code, helper function, tests for mutual info score --- R/clustering.r | 59 ++++++++++++++++++++++++++++++ R/helper-functions.r | 65 ++++++++++++++++++++++++++++++++- inst/tinytest/test-clustering.r | 23 ++++++++++++ 3 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 R/clustering.r create mode 100644 inst/tinytest/test-clustering.r diff --git a/R/clustering.r b/R/clustering.r new file mode 100644 index 0000000..4380947 --- /dev/null +++ b/R/clustering.r @@ -0,0 +1,59 @@ +##' @title +##' Clustering Metrics Parameters +##' +##' @description +##' Documentation for shared parameters of functions that compute clustering +##' metrics. +##' +##' @param actual \code{[numeric]} The ground truth numeric vector. +##' @param predicted \code{[numeric]} The predicted numeric vector, where each +##' element in the vector is a prediction of the corresponding elements in +##' \code{actual}. +##' @name clustering_params +##' @include helper-functions.r +NULL + + +##' @title +##' Adjusted Mutual Information Score / Mututal Information Score +##' +##' +##' @description +##' +##' \code{mtr_mutual_info_score} measures the similarity, or mutual dependence +##' between two variable. The worst possible score is 0, higher values are +##' better. +##' +##' +##' @inheritParams clustering_params +##' @importFrom stats var +##' @seealso \code{\link{mtr_r2}} +##' @return A numeric scalar output +##' @author Phuc Nguyen +##' @examples +##' +##' act <- sample(1:10, 100, replace = T) +##' pred <- sample(1:10, 100, replace = T) +##' mtr_mutual_info_score(act, pred) +##' +##' act <- rep(c('a', 'b', 'c'), times = 4) +##' pred <- rep(c('a', 'b', 'c'), each = 4) +##' mtr_mutual_info_score(act, pred) +##' +##' @export +mtr_mutual_info_score <- function(actual, predicted) { + chec_empty_vec(actual) + check_equal_length(actual, predicted) + entropy(actual) + entropy(predicted) - joint_entropy(vec_1 = actual, + vec_2 = predicted) +} + +mtr_normalized_mutual_info_score <- function(actual, predicted) { + mtr_mutual_info_score(actual = actual, predicted = predicted) / + mean(c(entropy(vec = actual), entropy(vec = predicted))) +} + +mtr_adjusted_mutual_info_score <- function(actual, predicted) { + (mtr_mutual_info_score(actual, predicted) - expected_mutual_info(actual, predicted)) / + (mean(c(entropy(actual), entropy(predicted))) - expected_mutual_info(actual, predicted)) +} diff --git a/R/helper-functions.r b/R/helper-functions.r index 12d9585..67125ed 100644 --- a/R/helper-functions.r +++ b/R/helper-functions.r @@ -1,5 +1,11 @@ - +chec_empty_vec <- function(vec) { + if (length(vec) == 0) { + stop("vector must have positive length.", call. = FALSE) + } + + invisible() +} check_equal_length <- function(actual, predicted) { @@ -60,3 +66,60 @@ trapezoid <- function(x, y) { sum(dx * height) } + +class_prob <- function(vec, class) { + chec_empty_vec(vec) + length(which(vec == class)) / length(vec) +} + +entropy <- function(vec) { + chec_empty_vec(vec) + li = c() + for (cl in unique(vec)) { + m = class_prob(vec = vec, class = cl) + li = c(li, -1 * m * log(m)) + } + etp = sum(li, na.rm = TRUE) + etp +} + +joint_class_prob <- function(vec_1, vec_2, class_1, class_2) { + chec_empty_vec(vec_1) + check_equal_length(vec_1, vec_2) + length(which(vec_1 == class_1 & vec_2 == class_2)) / length(vec_1) +} + +joint_entropy <- function(vec_1, vec_2) { + check_equal_length(vec_1, vec_2) + li = c() + for(cl_1 in unique(vec_1)) { + for(cl_2 in unique(vec_2)) { + m = joint_class_prob(vec_1 = vec_1, vec_2 = vec_2, + class_1 = cl_1, class_2 = cl_2) + li = c(li, - 1 * m * log(m)) + } + } + joint_etp = sum(li, na.rm = TRUE) + joint_etp +} + +expected_mutual_info <- function(vec_1, vec_2) { + check_equal_length(vec_1, vec_2) + N = length(vec_1) + li = c() + for (i in unique(vec_1)) { + a = length(which(vec_1 == i)) + for (j in unique(vec_2)) { + b = length(which(vec_2 == j)) + for (nij in max(a + b - N, 0, na.rm = TRUE): min(a, b, na.rm = TRUE)) { + li = c(li, (nij / N) * + log((N * nij) / (a * b)) * + (factorial(a) * factorial(b) * factorial(N - a) * factorial(N - b)) / + (factorial(N) * factorial(nij) * factorial(a - nij) * factorial(b - nij) * factorial(N - a - b + nij))) + } + } + } + emi = sum(li, na.rm = TRUE) + emi +} + diff --git a/inst/tinytest/test-clustering.r b/inst/tinytest/test-clustering.r new file mode 100644 index 0000000..998e093 --- /dev/null +++ b/inst/tinytest/test-clustering.r @@ -0,0 +1,23 @@ + +## test correctness ------------------------------------------------------------ + +vec_a = c(0, 1, 2, 0, 3, 4, 5, 1) +vec_b = c(1, 1, 0, 0, 2, 2, 2, 2) + +tinytest::expect_equal( + mtr_mutual_info_score(vec_a, vec_b), + target = 0.693147180559945, + tol = 1e-7 +) + +tinytest::expect_equal( + mtr_normalized_mutual_info_score(vec_a, vec_b), + target = 0.5163977794943221, + tol = 1e-7 +) + +tinytest::expect_equal( + mtr_adjusted_mutual_info_score(vec_a, vec_b), + target = -0.10526315789473674, + tol = 1e-7 +) From ff217fa8e069435622033becb7927eef9f464a8a Mon Sep 17 00:00:00 2001 From: minhsphuc12 Date: Sun, 28 Jul 2019 15:09:36 +0700 Subject: [PATCH 2/6] add code, test, export for adjusted rand score --- NAMESPACE | 2 ++ R/clustering.r | 39 ++++++++++++++++++++++++++++++++- R/helper-functions.r | 2 +- TODO.org | 2 +- inst/tinytest/test-clustering.r | 6 +++++ 5 files changed, 48 insertions(+), 3 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 8442b94..844af39 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -67,6 +67,8 @@ export(mtr_tpr) export(mtr_true_negative_rate) export(mtr_true_positive_rate) export(mtr_youden_index) +export(mtr_mutual_info_score) +export(mtr_adjusted_rand_score) importFrom(Rcpp,evalCpp) importFrom(stats,complete.cases) importFrom(stats,median) diff --git a/R/clustering.r b/R/clustering.r index 4380947..c9f8f3e 100644 --- a/R/clustering.r +++ b/R/clustering.r @@ -27,7 +27,7 @@ NULL ##' ##' @inheritParams clustering_params ##' @importFrom stats var -##' @seealso \code{\link{mtr_r2}} +##' @seealso \code{\link{mtr_adjusted_rand_score}} ##' @return A numeric scalar output ##' @author Phuc Nguyen ##' @examples @@ -57,3 +57,40 @@ mtr_adjusted_mutual_info_score <- function(actual, predicted) { (mtr_mutual_info_score(actual, predicted) - expected_mutual_info(actual, predicted)) / (mean(c(entropy(actual), entropy(predicted))) - expected_mutual_info(actual, predicted)) } + +##' @title +##' Adjusted Rand Score +##' +##' +##' @description +##' +##' \code{mtr_adjusted_rand_score} measures the similarity, or mutual dependence +##' between two variable. Perfect score is 1. Score between total random vectors +##' is close to 0. Score can be negative. +##' +##' +##' @inheritParams clustering_params +##' @importFrom base choose +##' @seealso \code{\link{mtr_mutual_info_score}} +##' @return A numeric scalar output +##' @author Phuc Nguyen +##' @examples +##' +##' act <- sample(1:10, 100, replace = T) +##' pred <- sample(1:10, 100, replace = T) +##' mtr_adjusted_rand_score(act, pred) +##' +##' act <- rep(c('a', 'b', 'c'), times = 4) +##' pred <- rep(c('a', 'b', 'c'), each = 4) +##' mtr_adjusted_rand_score(act, pred) +##' +##' @export + +mtr_adjusted_rand_score <- function(actual, predicted) { + check_equal_length(actual, predicted) + N = length(actual) + a = sum(choose(table(actual, predicted), 2)) + b = sum(choose(table(actual), 2)) * sum(choose(table(predicted), 2)) / choose(N, 2) + c = 1/2 * (sum(choose(table(actual), 2)) + sum(choose(table(predicted), 2))) + (a - b) / (c - b) +} diff --git a/R/helper-functions.r b/R/helper-functions.r index 67125ed..00fc92a 100644 --- a/R/helper-functions.r +++ b/R/helper-functions.r @@ -69,7 +69,7 @@ trapezoid <- function(x, y) { class_prob <- function(vec, class) { chec_empty_vec(vec) - length(which(vec == class)) / length(vec) + length(which(vec == class)) / length(vec = vec) } entropy <- function(vec) { diff --git a/TODO.org b/TODO.org index c29993d..1610a53 100644 --- a/TODO.org +++ b/TODO.org @@ -109,7 +109,7 @@ Proper scoring rule: - [ ] Adjusted Mututal Information Score / Mutual Information Score -- [ ] Adjusted Rand Score +- [X] Adjusted Rand Score - [ ] Calinski-Harabasz Score diff --git a/inst/tinytest/test-clustering.r b/inst/tinytest/test-clustering.r index 998e093..607a40f 100644 --- a/inst/tinytest/test-clustering.r +++ b/inst/tinytest/test-clustering.r @@ -21,3 +21,9 @@ tinytest::expect_equal( target = -0.10526315789473674, tol = 1e-7 ) + +tinytest::expect_equal( + mtr_adjusted_rand_score(vec_a, vec_b), + target = -0.12903225806451613, + tol = 1e-7 +) From 3bdebb7efb41e113fabe0c749b0681887c41c938 Mon Sep 17 00:00:00 2001 From: minhsphuc12 Date: Sun, 28 Jul 2019 21:45:46 +0700 Subject: [PATCH 3/6] add code, test, export for homogeneity, completeness and v measure --- NAMESPACE | 3 +++ R/clustering.r | 48 +++++++++++++++++++++++++++++++++ R/helper-functions.r | 16 ++++++++++- inst/tinytest/test-clustering.r | 21 +++++++++++++++ 4 files changed, 87 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 844af39..6fcdf35 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -69,6 +69,9 @@ export(mtr_true_positive_rate) export(mtr_youden_index) export(mtr_mutual_info_score) export(mtr_adjusted_rand_score) +export(mtr_homogeneity) +export(mtr_completeness) +export(mtr_v_measure) importFrom(Rcpp,evalCpp) importFrom(stats,complete.cases) importFrom(stats,median) diff --git a/R/clustering.r b/R/clustering.r index c9f8f3e..f1199e5 100644 --- a/R/clustering.r +++ b/R/clustering.r @@ -94,3 +94,51 @@ mtr_adjusted_rand_score <- function(actual, predicted) { c = 1/2 * (sum(choose(table(actual), 2)) + sum(choose(table(predicted), 2))) (a - b) / (c - b) } + +##' @title +##' Homogeneity, Completeness, V-measure +##' +##' +##' @description +##' +##' \code{mtr_homogeneity} and \code{mtr_completeness} measures an aspect of the +##' quality of clustering algorithm, as the former measures how similar the +##' elements within each cluster to each other, and the later measures +##' the degree a cluster has cover elements of same labels. +##' Both scores are in range of 0 to 1, as worst to best. +##' \code{mtr_v_measure} is harmonic mean of homogeneity score and completeness +##' score. +##' +##' @inheritParams clustering_params +##' @importFrom base choose +##' @return A numeric scalar output +##' @author Phuc Nguyen +##' @examples +##' +##' act <- sample(1:10, 100, replace = T) +##' pred <- sample(1:10, 100, replace = T) +##' mtr_homogeneity(act, pred) +##' +##' act <- sample(1:10, 100, replace = T) +##' pred <- sample(1:10, 100, replace = T) +##' mtr_completeness(act, pred) +##' +##' act <- sample(1:10, 100, replace = T) +##' pred <- sample(1:10, 100, replace = T) +##' mtr_v_measure(act, pred) +##' +##' @export + +mtr_homogeneity <- function(actual, predicted) { + 1 - conditional_entropy(actual, predicted) / entropy(actual) +} + +mtr_completeness <- function(actual, predicted) { + 1 - conditional_entropy(predicted, actual) / entropy(predicted) +} + +mtr_v_measure <- function(actual, predicted) { + h = mtr_homogeneity(actual, predicted) + c = mtr_completeness(actual, predicted) + 2 * h * c / (h + c) +} diff --git a/R/helper-functions.r b/R/helper-functions.r index 00fc92a..d366341 100644 --- a/R/helper-functions.r +++ b/R/helper-functions.r @@ -69,7 +69,7 @@ trapezoid <- function(x, y) { class_prob <- function(vec, class) { chec_empty_vec(vec) - length(which(vec == class)) / length(vec = vec) + length(which(vec == class)) / length(vec) } entropy <- function(vec) { @@ -123,3 +123,17 @@ expected_mutual_info <- function(vec_1, vec_2) { emi } +conditional_entropy <- function(vec_1, vec_2) { + check_equal_length(vec_1, vec_2) + N = length(vec_1) + li = c() + for (i in unique(vec_1)) { + for (j in unique(vec_2)) { + b = length(which(vec_2 == j)) + nij = length(which(vec_1 == i & vec_2 == j)) + li = c(li, - nij / N * log(nij / b)) + } + } + cond_entropy = sum(li, na.rm = TRUE) + cond_entropy +} diff --git a/inst/tinytest/test-clustering.r b/inst/tinytest/test-clustering.r index 607a40f..0ce4d78 100644 --- a/inst/tinytest/test-clustering.r +++ b/inst/tinytest/test-clustering.r @@ -27,3 +27,24 @@ tinytest::expect_equal( target = -0.12903225806451613, tol = 1e-7 ) + +tinytest::expect_equal( + mtr_homogeneity(vec_a, vec_b), + target = 0.3999999999999998, + tol = 1e-7 +) + +tinytest::expect_equal( + mtr_completeness(vec_a, vec_b), + target = 0.6666666666666665, + tol = 1e-7 +) + +tinytest::expect_equal( + mtr_v_measure(vec_a, vec_b), + target = 0.4999999999999998, + tol = 1e-7 +) + + + From 6efe5f33722209955b6418498e7fa04c83311d36 Mon Sep 17 00:00:00 2001 From: minhsphuc12 Date: Sun, 28 Jul 2019 21:50:20 +0700 Subject: [PATCH 4/6] update todo list for homogeneity, completeness, v_measure --- TODO.org | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/TODO.org b/TODO.org index 1610a53..69a01f4 100644 --- a/TODO.org +++ b/TODO.org @@ -115,10 +115,10 @@ Proper scoring rule: - [ ] Davies-Bouldin Score -- [ ] Completeness Metric +- [X] Completeness Metric -- [ ] V-Measure Score +- [X] V-Measure Score -- [ ] Homogeneity Score +- [X] Homogeneity Score - [ ] Mean Silhouette Coefficient / Silhouette Coefficient From ae03069055865ba61a26e262bbfbe02053efde61 Mon Sep 17 00:00:00 2001 From: minhsphuc12 Date: Mon, 29 Jul 2019 01:29:23 +0700 Subject: [PATCH 5/6] add code, test, export for Calinski-Harabasz Score --- NAMESPACE | 1 + R/clustering.r | 49 ++++++++++++++++++++++++++++++++- R/helper-functions.r | 9 ++++++ TODO.org | 2 +- inst/tinytest/test-clustering.r | 17 ++++++++++++ 5 files changed, 76 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 6fcdf35..8b6f174 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -72,6 +72,7 @@ export(mtr_adjusted_rand_score) export(mtr_homogeneity) export(mtr_completeness) export(mtr_v_measure) +export(mtr_calinski_harabasz) importFrom(Rcpp,evalCpp) importFrom(stats,complete.cases) importFrom(stats,median) diff --git a/R/clustering.r b/R/clustering.r index f1199e5..7ed030d 100644 --- a/R/clustering.r +++ b/R/clustering.r @@ -110,7 +110,6 @@ mtr_adjusted_rand_score <- function(actual, predicted) { ##' score. ##' ##' @inheritParams clustering_params -##' @importFrom base choose ##' @return A numeric scalar output ##' @author Phuc Nguyen ##' @examples @@ -142,3 +141,51 @@ mtr_v_measure <- function(actual, predicted) { c = mtr_completeness(actual, predicted) 2 * h * c / (h + c) } + +##' @title +##' Calinski-Harabasz Score (Variance Ratio Criterion) +##' +##' +##' @description +##' +##' \code{mtr_calinski_harabasz} measure the 'goodness' of clustering model +##' output, in case ground truth is unknown. Higher score mean cluster are dense +##' and well separated. +##' +##' @inheritParams clustering_params +##' @return A numeric scalar output +##' @author Phuc Nguyen +##' @examples +##' dt <- iris[,-5] +##' pred <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +##' 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +##' 1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +##' 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +##' 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, +##' 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, +##' 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0) +##' mtr_calinski_harabasz(dt, pred) +##' +##' @export + +mtr_calinski_harabasz <- function(matrix_feature, predicted) { + dt_center = apply(matrix_feature, 2, FUN = mean) + N = length(predicted) + num_cluster = length(unique(predicted)) + + check_number_of_labels(n_labels = num_cluster, n_samples = N) + + dispersion_between_cluster = 0 + dispersion_within_cluster = 0 + + for (cluster_val in unique(predicted)) { + size_cluster = length(which(predicted == cluster_val)) + dt_cluster = matrix_feature[which(predicted == cluster_val),] + cluster_center = apply(dt_cluster, 2, FUN = mean) + + dispersion_between_cluster = dispersion_between_cluster + size_cluster * sum((t(cluster_center) - dt_center) ^ 2) + dispersion_within_cluster = dispersion_within_cluster + sum((t(dt_cluster) - cluster_center) ^ 2) + } + + (dispersion_between_cluster / dispersion_within_cluster) * ((N - num_cluster) / (num_cluster - 1)) +} diff --git a/R/helper-functions.r b/R/helper-functions.r index d366341..6b1a94e 100644 --- a/R/helper-functions.r +++ b/R/helper-functions.r @@ -34,6 +34,15 @@ check_binary <- function(actual) { invisible() } +check_number_of_labels <- function(n_labels, n_samples) { + + if (! (1 < n_labels & n_labels < n_samples)) { + stop("Number of labels is invalid. Valid value are 2 to n_samples - 1", call. = FALSE) + } + + invisible() +} + clip <- function(x, mi, ma) { clip_(x, mi, ma) } diff --git a/TODO.org b/TODO.org index 69a01f4..871286c 100644 --- a/TODO.org +++ b/TODO.org @@ -111,7 +111,7 @@ Proper scoring rule: - [X] Adjusted Rand Score -- [ ] Calinski-Harabasz Score +- [X] Calinski-Harabasz Score - [ ] Davies-Bouldin Score diff --git a/inst/tinytest/test-clustering.r b/inst/tinytest/test-clustering.r index 0ce4d78..49f5e5a 100644 --- a/inst/tinytest/test-clustering.r +++ b/inst/tinytest/test-clustering.r @@ -3,6 +3,7 @@ vec_a = c(0, 1, 2, 0, 3, 4, 5, 1) vec_b = c(1, 1, 0, 0, 2, 2, 2, 2) +data(iris) tinytest::expect_equal( mtr_mutual_info_score(vec_a, vec_b), @@ -46,5 +47,21 @@ tinytest::expect_equal( tol = 1e-7 ) +tinytest::expect_equal( + mtr_calinski_harabasz(iris[,-5], + predicted = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, + 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, + 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0)), + target = 560.3999242466402, + tol = 1e-2 + # numerical calculation outcome between python and R return large small + # difference, hence relatively large tolerance level +) + + From 400281f19fce529a16bca4bfe4d9f44bc8465fe1 Mon Sep 17 00:00:00 2001 From: minhsphuc12 Date: Sat, 3 Aug 2019 03:10:10 +0700 Subject: [PATCH 6/6] add code, test, export for davies-boulding score --- NAMESPACE | 1 + R/clustering.r | 51 +++++++++++++++++++++++++++++++++ R/helper-functions.r | 29 +++++++++++++++++++ TODO.org | 2 +- inst/tinytest/test-clustering.r | 12 ++++++++ 5 files changed, 94 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 8b6f174..abd3e4e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -73,6 +73,7 @@ export(mtr_homogeneity) export(mtr_completeness) export(mtr_v_measure) export(mtr_calinski_harabasz) +export(mtr_davies_boulding_score) importFrom(Rcpp,evalCpp) importFrom(stats,complete.cases) importFrom(stats,median) diff --git a/R/clustering.r b/R/clustering.r index 7ed030d..891f871 100644 --- a/R/clustering.r +++ b/R/clustering.r @@ -153,6 +153,7 @@ mtr_v_measure <- function(actual, predicted) { ##' and well separated. ##' ##' @inheritParams clustering_params +##' @seealso \code{\link{mtr_davies_boulding_score}} ##' @return A numeric scalar output ##' @author Phuc Nguyen ##' @examples @@ -189,3 +190,53 @@ mtr_calinski_harabasz <- function(matrix_feature, predicted) { (dispersion_between_cluster / dispersion_within_cluster) * ((N - num_cluster) / (num_cluster - 1)) } + +##' @title +##' Davies-Bouldin Score +##' +##' +##' @description +##' +##' \code{mtr_davies_boulding_score} measure the 'goodness' of clustering model +##' output, in case ground truth is unknown. Lowest and best score is 0, +##' and lower score mean clusters are dense and separated +##' +##' @inheritParams clustering_params +##' @seealso \code{\link{mtr_calinski_harabasz}} +##' @return A numeric scalar output +##' @author Phuc Nguyen +##' @examples +##' dt <- iris[,-5] +##' pred <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +##' 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +##' 1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +##' 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +##' 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, +##' 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, +##' 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0) +##' mtr_calinski_harabasz(dt, pred) +##' +##' @export + +mtr_davies_boulding_score <- function(matrix_feature, predicted) { + check_equal_cluster_length(matrix_feature, predicted) + similarity_sum = 0 + for (cluster_val in unique(predicted)) { + dt_cluster = matrix_feature[which(predicted == cluster_val),] + cluster_center = apply(dt_cluster, 2, FUN = mean) + cluster_diameter = mean_distance(dt_cluster, cluster_center) + max_similarity = 0 + + for(other_cluster_val in unique(predicted)[unique(predicted) != cluster_val]) { + dt_other_cluster = matrix_feature[which(predicted == other_cluster_val),] + other_cluster_center = apply(dt_other_cluster, 2, FUN = mean) + other_cluster_diameter = mean_distance(dt_other_cluster, other_cluster_center) + between_distance = pairwise_distance(cluster_center, other_cluster_center) + similarity = (cluster_diameter + other_cluster_diameter) / between_distance + max_similarity = max(max_similarity, similarity) + } + similarity_sum = similarity_sum + max_similarity + } + + similarity_sum/length(unique(predicted)) +} diff --git a/R/helper-functions.r b/R/helper-functions.r index 6b1a94e..b561ec1 100644 --- a/R/helper-functions.r +++ b/R/helper-functions.r @@ -16,6 +16,16 @@ check_equal_length <- function(actual, predicted) { invisible() } +check_equal_cluster_length <- function(dt, predicted) { + + if (nrow(dt) != length(predicted)) { + stop("`data` and `cluster` must have equal length.", call. = FALSE) + } + + invisible() +} + + check_cutoff_range <- function(cutoff) { if (cutoff > 1 || cutoff < 0) { @@ -146,3 +156,22 @@ conditional_entropy <- function(vec_1, vec_2) { cond_entropy = sum(li, na.rm = TRUE) cond_entropy } + +pairwise_distance <- function(vec_1, vec_2) { + # calculate euclidean distance between two points + check_equal_length(vec_1, vec_2) + sqrt(sum((vec_1 - vec_2) ^ 2)) +} + +mean_distance <- function(set, pt) { + # calculate euclidean distance between a set of point, represented by + # data frame, to a single point + total_dist = 0 + check_equal_length(set, pt) + for (i in 1:nrow(set)) { + ve = set[i,] + total_dist = total_dist + pairwise_distance(ve, pt) + } + total_dist / nrow(set) +} + diff --git a/TODO.org b/TODO.org index 871286c..6d659a1 100644 --- a/TODO.org +++ b/TODO.org @@ -113,7 +113,7 @@ Proper scoring rule: - [X] Calinski-Harabasz Score -- [ ] Davies-Bouldin Score +- [X] Davies-Bouldin Score - [X] Completeness Metric diff --git a/inst/tinytest/test-clustering.r b/inst/tinytest/test-clustering.r index 49f5e5a..64fc1b7 100644 --- a/inst/tinytest/test-clustering.r +++ b/inst/tinytest/test-clustering.r @@ -62,6 +62,18 @@ tinytest::expect_equal( # difference, hence relatively large tolerance level ) +tinytest::expect_equal( + mtr_davies_boulding_score(iris[,-5], + predicted = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, + 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, + 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0)), + target = 0.6619715, + tol = 1e-5 +)