From e402e3eb9cb8484c21811f0dd4e5b83aa7ff29ca Mon Sep 17 00:00:00 2001 From: minhsphuc12 Date: Wed, 24 Jul 2019 22:47:02 +0700 Subject: [PATCH 1/6] add few item to todo list --- TODO.org | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/TODO.org b/TODO.org index 760e065..658d2c0 100644 --- a/TODO.org +++ b/TODO.org @@ -1,4 +1,4 @@ - + * List of performance metrics @@ -23,6 +23,8 @@ Metrics that built around confusion matrix: - [X] Balanced Accuracy +- [ ] Balanced Error Rate + - [X] Positive Predicted Value (PPV) / Precision - [ ] Average Precision @@ -31,12 +33,20 @@ Metrics that built around confusion matrix: - [X] False Omission Rate (FOR) +- [ ] Positive Likelihood + +- [ ] Negative Likelihood + - [X] Prevalence - [X] F1 Score +- [ ] F Measure (Weighted Harmonic Mean Between Precision And Recall) + - [X] Matthews Correlation Coefficient (MCC) +- [ ] Discriminant Power + - [X] Informedness (Bookmaker Informedness - BM) / Youden Index (Youden's J Statistic) - [X] Markedness (MK) @@ -77,14 +87,26 @@ Proper scoring rule: - [X] Mean Squared Error +- [ ] Normalized Mean Squared Error + - [X] Root Mean Squared Error - [X] Mean Squared Logarithmic Error - [X] Median Absolute Error +- [ ] Mean Absolute Percentage Error + +- [ ] Mean Absolute Scaled Error + +- [ ] Median Squared Error + - [X] R2 Score +- [ ] Adjusted R2 Score + +- [ ] M-Estimators + ** Clustering tasks - [ ] Adjusted Mututal Information Score / Mutual Information Score From e5eca2d7eeebf896606e44d35269b9052af21e92 Mon Sep 17 00:00:00 2001 From: minhsphuc12 Date: Sat, 27 Jul 2019 22:36:37 +0700 Subject: [PATCH 2/6] remove F-measure line from todo list as already implemented --- TODO.org | 2 -- 1 file changed, 2 deletions(-) diff --git a/TODO.org b/TODO.org index 658d2c0..c29993d 100644 --- a/TODO.org +++ b/TODO.org @@ -41,8 +41,6 @@ Metrics that built around confusion matrix: - [X] F1 Score -- [ ] F Measure (Weighted Harmonic Mean Between Precision And Recall) - - [X] Matthews Correlation Coefficient (MCC) - [ ] Discriminant Power From 249af3aa58a2340c58c357c356ac72034783de08 Mon Sep 17 00:00:00 2001 From: minhsphuc12 Date: Sat, 27 Jul 2019 23:08:15 +0700 Subject: [PATCH 3/6] add code, helper function, tests for mutual info score --- R/clustering.r | 59 ++++++++++++++++++++++++++++++ R/helper-functions.r | 65 ++++++++++++++++++++++++++++++++- inst/tinytest/test-clustering.r | 23 ++++++++++++ 3 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 R/clustering.r create mode 100644 inst/tinytest/test-clustering.r diff --git a/R/clustering.r b/R/clustering.r new file mode 100644 index 0000000..4380947 --- /dev/null +++ b/R/clustering.r @@ -0,0 +1,59 @@ +##' @title +##' Clustering Metrics Parameters +##' +##' @description +##' Documentation for shared parameters of functions that compute clustering +##' metrics. +##' +##' @param actual \code{[numeric]} The ground truth numeric vector. +##' @param predicted \code{[numeric]} The predicted numeric vector, where each +##' element in the vector is a prediction of the corresponding elements in +##' \code{actual}. +##' @name clustering_params +##' @include helper-functions.r +NULL + + +##' @title +##' Adjusted Mutual Information Score / Mututal Information Score +##' +##' +##' @description +##' +##' \code{mtr_mutual_info_score} measures the similarity, or mutual dependence +##' between two variable. The worst possible score is 0, higher values are +##' better. +##' +##' +##' @inheritParams clustering_params +##' @importFrom stats var +##' @seealso \code{\link{mtr_r2}} +##' @return A numeric scalar output +##' @author Phuc Nguyen +##' @examples +##' +##' act <- sample(1:10, 100, replace = T) +##' pred <- sample(1:10, 100, replace = T) +##' mtr_mutual_info_score(act, pred) +##' +##' act <- rep(c('a', 'b', 'c'), times = 4) +##' pred <- rep(c('a', 'b', 'c'), each = 4) +##' mtr_mutual_info_score(act, pred) +##' +##' @export +mtr_mutual_info_score <- function(actual, predicted) { + chec_empty_vec(actual) + check_equal_length(actual, predicted) + entropy(actual) + entropy(predicted) - joint_entropy(vec_1 = actual, + vec_2 = predicted) +} + +mtr_normalized_mutual_info_score <- function(actual, predicted) { + mtr_mutual_info_score(actual = actual, predicted = predicted) / + mean(c(entropy(vec = actual), entropy(vec = predicted))) +} + +mtr_adjusted_mutual_info_score <- function(actual, predicted) { + (mtr_mutual_info_score(actual, predicted) - expected_mutual_info(actual, predicted)) / + (mean(c(entropy(actual), entropy(predicted))) - expected_mutual_info(actual, predicted)) +} diff --git a/R/helper-functions.r b/R/helper-functions.r index 12d9585..67125ed 100644 --- a/R/helper-functions.r +++ b/R/helper-functions.r @@ -1,5 +1,11 @@ - +chec_empty_vec <- function(vec) { + if (length(vec) == 0) { + stop("vector must have positive length.", call. = FALSE) + } + + invisible() +} check_equal_length <- function(actual, predicted) { @@ -60,3 +66,60 @@ trapezoid <- function(x, y) { sum(dx * height) } + +class_prob <- function(vec, class) { + chec_empty_vec(vec) + length(which(vec == class)) / length(vec) +} + +entropy <- function(vec) { + chec_empty_vec(vec) + li = c() + for (cl in unique(vec)) { + m = class_prob(vec = vec, class = cl) + li = c(li, -1 * m * log(m)) + } + etp = sum(li, na.rm = TRUE) + etp +} + +joint_class_prob <- function(vec_1, vec_2, class_1, class_2) { + chec_empty_vec(vec_1) + check_equal_length(vec_1, vec_2) + length(which(vec_1 == class_1 & vec_2 == class_2)) / length(vec_1) +} + +joint_entropy <- function(vec_1, vec_2) { + check_equal_length(vec_1, vec_2) + li = c() + for(cl_1 in unique(vec_1)) { + for(cl_2 in unique(vec_2)) { + m = joint_class_prob(vec_1 = vec_1, vec_2 = vec_2, + class_1 = cl_1, class_2 = cl_2) + li = c(li, - 1 * m * log(m)) + } + } + joint_etp = sum(li, na.rm = TRUE) + joint_etp +} + +expected_mutual_info <- function(vec_1, vec_2) { + check_equal_length(vec_1, vec_2) + N = length(vec_1) + li = c() + for (i in unique(vec_1)) { + a = length(which(vec_1 == i)) + for (j in unique(vec_2)) { + b = length(which(vec_2 == j)) + for (nij in max(a + b - N, 0, na.rm = TRUE): min(a, b, na.rm = TRUE)) { + li = c(li, (nij / N) * + log((N * nij) / (a * b)) * + (factorial(a) * factorial(b) * factorial(N - a) * factorial(N - b)) / + (factorial(N) * factorial(nij) * factorial(a - nij) * factorial(b - nij) * factorial(N - a - b + nij))) + } + } + } + emi = sum(li, na.rm = TRUE) + emi +} + diff --git a/inst/tinytest/test-clustering.r b/inst/tinytest/test-clustering.r new file mode 100644 index 0000000..998e093 --- /dev/null +++ b/inst/tinytest/test-clustering.r @@ -0,0 +1,23 @@ + +## test correctness ------------------------------------------------------------ + +vec_a = c(0, 1, 2, 0, 3, 4, 5, 1) +vec_b = c(1, 1, 0, 0, 2, 2, 2, 2) + +tinytest::expect_equal( + mtr_mutual_info_score(vec_a, vec_b), + target = 0.693147180559945, + tol = 1e-7 +) + +tinytest::expect_equal( + mtr_normalized_mutual_info_score(vec_a, vec_b), + target = 0.5163977794943221, + tol = 1e-7 +) + +tinytest::expect_equal( + mtr_adjusted_mutual_info_score(vec_a, vec_b), + target = -0.10526315789473674, + tol = 1e-7 +) From c998ce2938c22f75d38cf27ef9b8a4c205247c53 Mon Sep 17 00:00:00 2001 From: minhsphuc12 Date: Mon, 29 Jul 2019 01:32:43 +0700 Subject: [PATCH 4/6] update test due to detecting error in sklearn implementation --- inst/tinytest/test-clustering.r | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/inst/tinytest/test-clustering.r b/inst/tinytest/test-clustering.r index 998e093..97007c7 100644 --- a/inst/tinytest/test-clustering.r +++ b/inst/tinytest/test-clustering.r @@ -12,12 +12,18 @@ tinytest::expect_equal( tinytest::expect_equal( mtr_normalized_mutual_info_score(vec_a, vec_b), - target = 0.5163977794943221, + # target = 0.5163977794943221, + # changed test value due to respective example in sklearn.metrics is + # wrongly implemented + target = 0.5, tol = 1e-7 ) tinytest::expect_equal( mtr_adjusted_mutual_info_score(vec_a, vec_b), - target = -0.10526315789473674, + # target = -0.10526315789473674, + # changed test value due to respective example in sklearn.metrics is + # wrongly implemented + target = -0.1666666667, tol = 1e-7 ) From 3c27f8ac090b4e682f5adbe068916e9bba88e289 Mon Sep 17 00:00:00 2001 From: minhsphuc12 Date: Mon, 29 Jul 2019 01:36:48 +0700 Subject: [PATCH 5/6] update namespace and TODO list --- NAMESPACE | 3 +++ TODO.org | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 8442b94..77b6ef5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -67,6 +67,9 @@ export(mtr_tpr) export(mtr_true_negative_rate) export(mtr_true_positive_rate) export(mtr_youden_index) +export(mtr_mutual_info_score) +export(mtr_normalized_mutual_info_score) +export(mtr_adjusted_mutual_info_score) importFrom(Rcpp,evalCpp) importFrom(stats,complete.cases) importFrom(stats,median) diff --git a/TODO.org b/TODO.org index c29993d..fb2ff27 100644 --- a/TODO.org +++ b/TODO.org @@ -107,7 +107,7 @@ Proper scoring rule: ** Clustering tasks -- [ ] Adjusted Mututal Information Score / Mutual Information Score +- [X] Adjusted Mututal Information Score / Mutual Information Score - [ ] Adjusted Rand Score From 679c0cdeaf52a876e37829b7a6693392ab913e95 Mon Sep 17 00:00:00 2001 From: minhsphuc12 Date: Mon, 29 Jul 2019 19:54:48 +0700 Subject: [PATCH 6/6] fix comment on why test value is not the same as python version --- inst/tinytest/test-clustering.r | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/tinytest/test-clustering.r b/inst/tinytest/test-clustering.r index 97007c7..404d25b 100644 --- a/inst/tinytest/test-clustering.r +++ b/inst/tinytest/test-clustering.r @@ -14,7 +14,7 @@ tinytest::expect_equal( mtr_normalized_mutual_info_score(vec_a, vec_b), # target = 0.5163977794943221, # changed test value due to respective example in sklearn.metrics is - # wrongly implemented + # for version 0.21. Below value is compatible with version 0.22. target = 0.5, tol = 1e-7 ) @@ -23,7 +23,7 @@ tinytest::expect_equal( mtr_adjusted_mutual_info_score(vec_a, vec_b), # target = -0.10526315789473674, # changed test value due to respective example in sklearn.metrics is - # wrongly implemented + # for version 0.21. Below value is compatible with version 0.22. target = -0.1666666667, tol = 1e-7 )