diff --git a/DESCRIPTION b/DESCRIPTION index ae4fa1af..396507be 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -25,6 +25,8 @@ LinkingTo: Rcpp Depends: R (>= 2.10) Suggests: + cluster, + clue, laeken, parallel, testthat, @@ -95,6 +97,7 @@ Collate: 'pram.R' 'rankSwap.R' 'RcppExports.R' + 'recordLinkage.R' 'recordSwap.R' 'report.R' 'riskyCells.R' diff --git a/NAMESPACE b/NAMESPACE index dd2285de..92407f39 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,7 @@ S3method(print,measure_risk) S3method(print,micro) S3method(print,modrisk) S3method(print,pram) +S3method(print,recordLinkage) S3method(print,suda2) S3method(recordSwap,default) S3method(recordSwap,sdcMicroObj) @@ -64,6 +65,7 @@ export(plotMicro) export(pram) export(rankSwap) export(readMicrodata) +export(recordLinkage) export(recordSwap) export(removeDirectID) export(report) diff --git a/R/recordLinkage.R b/R/recordLinkage.R new file mode 100644 index 00000000..2c5b93ce --- /dev/null +++ b/R/recordLinkage.R @@ -0,0 +1,483 @@ +#' Record linkage via Global Distance-Based Record Linkage +#' +#' @description +#' Implements the Global Distance-Based Record Linkage +#' (GDBRL; Herranz et al., 2015), which links records in an original dataset to +#' records in an anonymized/protected dataset by computing pairwise distances on +#' selected linkage variables and then finding the minimum-total-distance +#' one-to-one matching via the Hungarian algorithm. +#' +#' This corresponds to an attacker scenario in which the adversary knows the +#' original data, or equivalent external information, for the linkage variables +#' and uses the released protected dataset to infer the most plausible global +#' record-to-record matching. +#' +#' @name recordLinkage +#' @docType methods +#' +#' @param x A `data.frame` containing the original data. +#' @param y A `data.frame` containing the anonymized/protected data. +#' @param vars Character vector of variable names used for record linkage. +#' These variables must exist in both `x` and `y`. +#' @param distance Character string specifying the distance metric. One of +#' `"gower"` (default), `"euclidean"`, or `"manhattan"`. +#' @param weights Optional numeric vector of variable weights passed to +#' [cluster::daisy()]. Must have length `length(vars)`. If `NULL`, equal +#' weights are used. +#' @param x_id Optional single character string naming the identifier column in +#' `x`. If `NULL`, row numbers are used as truth IDs. +#' @param y_id Optional single character string naming the identifier column in +#' `y`. If `NULL`, row numbers are used as truth IDs. +#' @param return_matrix Logical; if `TRUE`, the full pairwise distance matrix is +#' returned. +#' @param na_action Character string specifying how to handle missing values in +#' linkage variables. One of: +#' \describe{ +#' \item{`ignore`}{retain missing values and compute pairwise distances +#' using the subset of linkage variables observed for each record pair, as +#' handled by [cluster::daisy()]. Distances for different record pairs may +#' therefore be based on different numbers of variables. Missing values are +#' not treated as a separate category and do not contribute directly to the +#' corresponding variable-specific distance.} +#' \item{`fail`}{stop if linkage variables contain any missing values.} +#' } +#' @param tol Numeric tolerance used to determine tied minimum distances. +#' +#' @return An object of class `"recordLinkage"` with elements: +#' \describe{ +#' \item{matches}{A data.frame with matched pairs and corresponding distances} +#' \item{correct_matches}{Number of correctly linked records} +#' \item{correct_match_rate}{Proportion of correctly linked records} +#' \item{mean_distance}{Mean matched distance} +#' \item{total_distance}{Total matched distance} +#' \item{distance_matrix}{Optional full pairwise distance matrix} +#' \item{call}{The matched call} +#' } +#' +#' @details +#' The distance measure can be chosen via `distance`. Gower distance is suitable +#' for mixed-type quasi-identifiers, including numeric, factor, character, and +#' logical variables. Variables of class factor are treated as nominal variables, +#' while variables of class ordered are treated as ordinal variables. +#' Euclidean and Manhattan distances are supported for purely +#' numeric linkage variables. The Hungarian algorithm finds the global +#' minimum-cost one-to-one assignment. +#' +#' In addition to the global assignment, the function also returns the number of +#' candidates attaining the minimum distance (`n_best`). The quantity +#' `n_best` counts, for each record in `x`, how many records in `y` attain the +#' same minimum row-wise distance in the pairwise distance matrix. If multiple +#' optimal assignments exist, the chosen solution depends on the deterministic +#' behavior of [clue::solve_LSAP()] for the supplied cost matrix. +#' +#' For strict global assignment, `nrow(x)` must equal `nrow(y)`. +#' If `x_id` and `y_id` are not supplied, row order is treated as the truth for +#' evaluating correct matches. +#' +#' Results depend on both the matching direction and the row order of the input +#' data frames. +#' +#' @references +#' +#' Herranz, J., Nin, J., Rodríguez, P., and Tassa, T. (2015). \emph{Revisiting +#' distance-based record linkage for privacy-preserving release of statistical +#' datasets}. Data & Knowledge Engineering, 100, 78--93. +#' \doi{10.1016/j.datak.2015.07.009} +#' +#' Hornik, K. (2005). \emph{A CLUE for cluster ensembles}. +#' Journal of Statistical Software, 14(12). +#' \doi{10.18637/jss.v014.i12} +#' +#' Maechler, M., Rousseeuw, P., Struyf, A., Hubert, M., & Hornik, K. (2026). +#' \emph{cluster: Cluster Analysis Basics and Extensions}. +#' \url{https://CRAN.R-project.org/package=cluster} +#' +#' @examples +#' x <- data.frame( +#' id = c(1, 2, 3), +#' age = c(23, 40, 35), +#' sex = factor(c("f", "m", "f")), +#' region = c("A", "B", "A"), +#' stringsAsFactors = FALSE +#' ) +#' +#' y <- data.frame( +#' id = c(1, 2, 3), +#' age = c(24, 39, 35), +#' sex = factor(c("f", "m", "f")), +#' region = c("A", "B", "B"), +#' stringsAsFactors = FALSE +#' ) +#' +#' out <- recordLinkage( +#' x = x, +#' y = y, +#' vars = c("age", "sex", "region"), +#' distance = "gower", +#' x_id = "id", +#' y_id = "id" +#' ) +#' +#' out +#' out$matches +#' +#' @export +recordLinkage <- function(x, + y, + vars, + distance = c("gower", "euclidean", "manhattan"), + weights = NULL, + x_id = NULL, + y_id = NULL, + return_matrix = FALSE, + na_action = c("ignore", "fail"), + tol = sqrt(.Machine$double.eps)) { + + distance <- match.arg(distance) + na_action <- match.arg(na_action) + + # input checks --------------------------------------------------------------- + if (!is.data.frame(x)) { + stop("`x` must be a data.frame.", call. = FALSE) + } + if (!is.data.frame(y)) { + stop("`y` must be a data.frame.", call. = FALSE) + } + if (!is.character(vars) || length(vars) == 0L) { + stop("`vars` must be a non-empty character vector.", call. = FALSE) + } + if (!all(vars %in% names(x))) { + missing_x <- vars[!vars %in% names(x)] + stop("Variables not found in `x`: ", paste(missing_x, collapse = ", "), + call. = FALSE) + } + if (!all(vars %in% names(y))) { + missing_y <- vars[!vars %in% names(y)] + stop("Variables not found in `y`: ", paste(missing_y, collapse = ", "), + call. = FALSE) + } + if (anyDuplicated(vars)) { + stop("`vars` must not contain duplicate variable names.", call. = FALSE) + } + if (nrow(x) != nrow(y)) { + stop("`x` and `y` must have the same number of rows for strict one-to-one ", + "Hungarian assignment.", call. = FALSE) + } + if (!is.logical(return_matrix) || length(return_matrix) != 1L || is.na(return_matrix)) { + stop("`return_matrix` must be TRUE or FALSE.", call. = FALSE) + } + if (!is.numeric(tol) || length(tol) != 1L || !is.finite(tol) || tol < 0) { + stop("`tol` must be a single non-negative finite number.", call. = FALSE) + } + if (!requireNamespace("clue", quietly = TRUE)) { + stop("Package 'clue' is required for this function.", call. = FALSE) + } + if (!requireNamespace("cluster", quietly = TRUE)) { + stop("Package 'cluster' is required for this function.", call. = FALSE) + } + + # identifier handling -------------------------------------------------------- + if (is.null(x_id)) { + x_ids <- seq_len(nrow(x)) + } else { + if (!is.character(x_id) || length(x_id) != 1L) { + stop("`x_id` must be NULL or a single column name.", call. = FALSE) + } + if (!x_id %in% names(x)) { + stop("`x_id` not found in `x`.", call. = FALSE) + } + x_ids <- x[[x_id]] + } + + if (is.null(y_id)) { + y_ids <- seq_len(nrow(y)) + } else { + if (!is.character(y_id) || length(y_id) != 1L) { + stop("`y_id` must be NULL or a single column name.", call. = FALSE) + } + if (!y_id %in% names(y)) { + stop("`y_id` not found in `y`.", call. = FALSE) + } + y_ids <- y[[y_id]] + } + + if (length(x_ids) != nrow(x)) { + stop("Identifier column `x_id` must have length nrow(x).", call. = FALSE) + } + if (length(y_ids) != nrow(y)) { + stop("Identifier column `y_id` must have length nrow(y).", call. = FALSE) + } + + if (anyNA(x_ids) || anyNA(y_ids)) { + warning( + "Missing values found in `x_id` or `y_id`; correctness of matched pairs ", + "cannot be fully evaluated for those records.", + call. = FALSE + ) + } + + # weights -------------------------------------------------------------------- + if (is.null(weights)) { + weights <- rep(1, length(vars)) + } else { + if (!is.numeric(weights) || length(weights) != length(vars)) { + stop("`weights` must be numeric with length equal to `length(vars)`.", + call. = FALSE) + } + if (any(!is.finite(weights)) || any(weights < 0)) { + stop("`weights` must contain finite non-negative values.", call. = FALSE) + } + if (sum(weights) == 0) { + stop("At least one weight must be positive.", call. = FALSE) + } + } + + # subset linkage variables --------------------------------------------------- + x_sub <- x[, vars, drop = FALSE] + y_sub <- y[, vars, drop = FALSE] + + harmonized <- .harmonize_linkage_data(x_sub, y_sub) + x_sub <- harmonized$x + y_sub <- harmonized$y + + # metric-specific checks ----------------------------------------------------- + if (distance %in% c("euclidean", "manhattan")) { + is_numeric_like <- vapply( + x_sub, + function(z) is.numeric(z) || is.integer(z) || is.logical(z), + logical(1L) + ) & + vapply( + y_sub, + function(z) is.numeric(z) || is.integer(z) || is.logical(z), + logical(1L) + ) + + if (!all(is_numeric_like)) { + bad_vars <- vars[!is_numeric_like] + stop( + "For `distance = '", distance, "'`, all linkage variables must be ", + "numeric, integer, or logical. Non-numeric variables found: ", + paste(bad_vars, collapse = ", "), + ". Use `distance = 'gower'` for mixed-type data.", + call. = FALSE + ) + } + + x_sub[] <- lapply(x_sub, function(z) as.numeric(z)) + y_sub[] <- lapply(y_sub, function(z) as.numeric(z)) + } + + # missingness handling ------------------------------------------------------- + if (identical(na_action, "fail")) { + if (anyNA(x_sub) || anyNA(y_sub)) { + stop("Missing values found in linkage variables and `na_action = 'fail'`.", + call. = FALSE) + } + } + + # pairwise distance matrix --------------------------------------------------- + combined <- rbind(x_sub, y_sub) + + d_all <- cluster::daisy( + x = combined, + metric = distance, + weights = weights + ) + + d_mat <- as.matrix(d_all) + n_x <- nrow(x_sub) + n_y <- nrow(y_sub) + + cost <- d_mat[seq_len(n_x), n_x + seq_len(n_y), drop = FALSE] + + dimnames(cost) <- list(seq_len(n_x), seq_len(n_y)) + + if (any(!is.finite(cost))) { + stop( + "Non-finite entries found in the pairwise distance matrix. ", + "This can occur when missing linkage values prevent distance computation ", + "for one or more record pairs.", + call. = FALSE + ) + } + + # tied best candidates based on cost matrix ---------------------------------- + row_min <- apply(cost, 1L, min) + + n_best <- vapply( + seq_len(nrow(cost)), + function(i) { + sum(abs(cost[i, ] - row_min[i]) <= tol) + }, + integer(1L) + ) + + # Hungarian assignment ------------------------------------------------------- + assignment <- clue::solve_LSAP(cost, maximum = FALSE) + + matched_y_index <- as.integer(assignment) + matched_cost <- cost[cbind(seq_len(n_x), matched_y_index)] + + matched_x_ids <- x_ids + matched_y_ids <- y_ids[matched_y_index] + + correct <- matched_x_ids == matched_y_ids + + matches <- data.frame( + x_row = seq_len(n_x), + y_row = matched_y_index, + x_id = matched_x_ids, + y_id = matched_y_ids, + distance = matched_cost, + correct_match = correct, + n_best = n_best, + stringsAsFactors = FALSE + ) + + correct_matches <- sum(correct, na.rm = TRUE) + + n_evaluable <- sum(!is.na(correct)) + + correct_match_rate <- if (n_evaluable > 0L) { + mean(correct, na.rm = TRUE) + } else { + NA_real_ + } + + out <- list( + matches = matches, + correct_matches = correct_matches, + correct_match_rate = correct_match_rate, + mean_distance = mean(matched_cost), + total_distance = sum(matched_cost), + call = match.call() + ) + + if (isTRUE(return_matrix)) { + out$distance_matrix <- cost + } + + class(out) <- "recordLinkage" + out +} + +# harmonize variable classes across files ------------------------------------ +#' Harmonize linkage variables across two data.frames +#' +#' Internal helper that coerces corresponding variables in two data.frames to +#' compatible classes for distance computation. +#' +#' @param x A data.frame. +#' @param y A data.frame. +#' +#' @return A list with harmonized `x` and `y`. +#' @noRd +.harmonize_linkage_data <- function(x, y) { + + out_x <- x + out_y <- y + + for (j in seq_along(out_x)) { + xj <- out_x[[j]] + yj <- out_y[[j]] + var_name <- names(out_x)[j] + + # ordered factors -------------------------------------------------------- + if (is.ordered(xj) || is.ordered(yj)) { + + # both ordered: require identical level order + if (is.ordered(xj) && is.ordered(yj)) { + lev_x <- levels(xj) + lev_y <- levels(yj) + + if (!identical(lev_x, lev_y)) { + stop( + "Ordered factor levels for variable `", var_name, + "` are not identical in `x` and `y`.", + call. = FALSE + ) + } + + lev <- lev_x + + # one ordered, one unordered/character: use ordered levels as reference + } else if (is.ordered(xj)) { + lev <- levels(xj) + + extra_y <- setdiff(unique(as.character(yj[!is.na(yj)])), lev) + if (length(extra_y) > 0L) { + stop( + "Variable `", var_name, "` is ordered in `x`, but `y` contains ", + "values not present in the ordered levels: ", + paste(extra_y, collapse = ", "), + call. = FALSE + ) + } + + } else { # is.ordered(yj) + lev <- levels(yj) + + extra_x <- setdiff(unique(as.character(xj[!is.na(xj)])), lev) + if (length(extra_x) > 0L) { + stop( + "Variable `", var_name, "` is ordered in `y`, but `x` contains ", + "values not present in the ordered levels: ", + paste(extra_x, collapse = ", "), + call. = FALSE + ) + } + } + + out_x[[j]] <- ordered(as.character(xj), levels = lev) + out_y[[j]] <- ordered(as.character(yj), levels = lev) + + # unordered factor / character ----------------------------------------- + } else if (is.factor(xj) || is.factor(yj) || + is.character(xj) || is.character(yj)) { + + lev <- union(as.character(unique(xj)), as.character(unique(yj))) + lev <- lev[!is.na(lev)] + + out_x[[j]] <- factor(as.character(xj), levels = lev) + out_y[[j]] <- factor(as.character(yj), levels = lev) + + # logical -------------------------------------------------------------- + } else if (is.logical(xj) || is.logical(yj)) { + out_x[[j]] <- as.logical(xj) + out_y[[j]] <- as.logical(yj) + + # numeric / integer ---------------------------------------------------- + } else if ((is.numeric(xj) || is.integer(xj)) && + (is.numeric(yj) || is.integer(yj))) { + out_x[[j]] <- as.numeric(xj) + out_y[[j]] <- as.numeric(yj) + + } else { + stop( + "Unsupported or incompatible variable classes for variable `", + var_name, "`: ", + paste(class(xj), collapse = "/"), " vs ", + paste(class(yj), collapse = "/"), + call. = FALSE + ) + } + } + + list(x = out_x, y = out_y) +} + +#' @method print recordLinkage +#' @export +print.recordLinkage <- function(x, digits = 2, ...) { + n_evaluable <- sum(!is.na(x$matches$correct_match)) + + cat("\n") + cat("Correct matches: ", + x$correct_matches, "/", n_evaluable, "\n", sep = "") + cat("Correct match percent: ", + format(round(x$correct_match_rate * 100, digits), nsmall = digits), "%\n", sep = "") + cat("Mean distance: ", + format(x$mean_distance, digits = 6), "\n", sep = "") + invisible(x) +} \ No newline at end of file diff --git a/man/recordLinkage.Rd b/man/recordLinkage.Rd new file mode 100644 index 00000000..24ff21ba --- /dev/null +++ b/man/recordLinkage.Rd @@ -0,0 +1,149 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recordLinkage.R +\docType{methods} +\name{recordLinkage} +\alias{recordLinkage} +\title{Record linkage via Global Distance-Based Record Linkage} +\usage{ +recordLinkage( + x, + y, + vars, + distance = c("gower", "euclidean", "manhattan"), + weights = NULL, + x_id = NULL, + y_id = NULL, + return_matrix = FALSE, + na_action = c("ignore", "fail"), + tol = sqrt(.Machine$double.eps) +) +} +\arguments{ +\item{x}{A `data.frame` containing the original data.} + +\item{y}{A `data.frame` containing the anonymized/protected data.} + +\item{vars}{Character vector of variable names used for record linkage. +These variables must exist in both `x` and `y`.} + +\item{distance}{Character string specifying the distance metric. One of +`"gower"` (default), `"euclidean"`, or `"manhattan"`.} + +\item{weights}{Optional numeric vector of variable weights passed to +[cluster::daisy()]. Must have length `length(vars)`. If `NULL`, equal +weights are used.} + +\item{x_id}{Optional single character string naming the identifier column in +`x`. If `NULL`, row numbers are used as truth IDs.} + +\item{y_id}{Optional single character string naming the identifier column in +`y`. If `NULL`, row numbers are used as truth IDs.} + +\item{return_matrix}{Logical; if `TRUE`, the full pairwise distance matrix is +returned.} + +\item{na_action}{Character string specifying how to handle missing values in +linkage variables. One of: +\describe{ + \item{`ignore`}{retain missing values and compute pairwise distances + using the subset of linkage variables observed for each record pair, as + handled by [cluster::daisy()]. Distances for different record pairs may + therefore be based on different numbers of variables. Missing values are + not treated as a separate category and do not contribute directly to the + corresponding variable-specific distance.} + \item{`fail`}{stop if linkage variables contain any missing values.} +}} + +\item{tol}{Numeric tolerance used to determine tied minimum distances.} +} +\value{ +An object of class `"recordLinkage"` with elements: +\describe{ + \item{matches}{A data.frame with matched pairs and corresponding distances} + \item{correct_matches}{Number of correctly linked records} + \item{correct_match_rate}{Proportion of correctly linked records} + \item{mean_distance}{Mean matched distance} + \item{total_distance}{Total matched distance} + \item{distance_matrix}{Optional full pairwise distance matrix} + \item{call}{The matched call} +} +} +\description{ +Implements the Global Distance-Based Record Linkage +(GDBRL; Herranz et al., 2015), which links records in an original dataset to +records in an anonymized/protected dataset by computing pairwise distances on +selected linkage variables and then finding the minimum-total-distance +one-to-one matching via the Hungarian algorithm. + +This corresponds to an attacker scenario in which the adversary knows the +original data, or equivalent external information, for the linkage variables +and uses the released protected dataset to infer the most plausible global +record-to-record matching. +} +\details{ +The distance measure can be chosen via `distance`. Gower distance is suitable +for mixed-type quasi-identifiers, including numeric, factor, character, and +logical variables. Variables of class factor are treated as nominal variables, +while variables of class ordered are treated as ordinal variables. +Euclidean and Manhattan distances are supported for purely +numeric linkage variables. The Hungarian algorithm finds the global +minimum-cost one-to-one assignment. + +In addition to the global assignment, the function also returns the number of +candidates attaining the minimum distance (`n_best`). The quantity +`n_best` counts, for each record in `x`, how many records in `y` attain the +same minimum row-wise distance in the pairwise distance matrix. If multiple +optimal assignments exist, the chosen solution depends on the deterministic +behavior of [clue::solve_LSAP()] for the supplied cost matrix. + +For strict global assignment, `nrow(x)` must equal `nrow(y)`. +If `x_id` and `y_id` are not supplied, row order is treated as the truth for +evaluating correct matches. + +Results depend on both the matching direction and the row order of the input +data frames. +} +\examples{ +x <- data.frame( + id = c(1, 2, 3), + age = c(23, 40, 35), + sex = factor(c("f", "m", "f")), + region = c("A", "B", "A"), + stringsAsFactors = FALSE +) + +y <- data.frame( + id = c(1, 2, 3), + age = c(24, 39, 35), + sex = factor(c("f", "m", "f")), + region = c("A", "B", "B"), + stringsAsFactors = FALSE +) + +out <- recordLinkage( + x = x, + y = y, + vars = c("age", "sex", "region"), + distance = "gower", + x_id = "id", + y_id = "id" +) + +out +out$matches + +} +\references{ +Herranz, J., Nin, J., Rodríguez, P., and Tassa, T. (2015). \emph{Revisiting +distance-based record linkage for privacy-preserving release of statistical +datasets}. Data & Knowledge Engineering, 100, 78--93. +\doi{10.1016/j.datak.2015.07.009} + +Hornik, K. (2005). \emph{A CLUE for cluster ensembles}. +Journal of Statistical Software, 14(12). +\doi{10.18637/jss.v014.i12} + +Maechler, M., Rousseeuw, P., Struyf, A., Hubert, M., & Hornik, K. (2026). +\emph{cluster: Cluster Analysis Basics and Extensions}. +\url{https://CRAN.R-project.org/package=cluster} +} diff --git a/tests/testthat/test_recordLinkage.R b/tests/testthat/test_recordLinkage.R new file mode 100644 index 00000000..7281f7e5 --- /dev/null +++ b/tests/testthat/test_recordLinkage.R @@ -0,0 +1,835 @@ +# Tests for recordLinkage Function + +# load library +library(testthat) + +testthat::skip_if_not_installed("clue") +testthat::skip_if_not_installed("cluster") + +test_that("recordLinkage returns correct match rate (example 1 from Harrenz et al. (2015))", { + x <- data.frame( + v1 = c(1, 0, -1, 0), + v2 = c(0, 1, 0, -1), + id = 1:4 + ) + + y <- data.frame( + v1 = c(0, 0, -2.1, 0), + v2 = c(0, 2.1, 0, -2.1), + id = 1:4 + ) + + out <- recordLinkage(x, + y, + x_id = "id", + y_id = "id", + na_action = "ignore", + return_matrix = TRUE, + vars = c("v1", "v2")) + + expect_equal(out$correct_match_rate, 1) +}) + +test_that("recordLinkage returns correct match rate (example 2 from Harrenz et al. (2015))", { + x <- data.frame( + v1 = c(1, 2, 3, 4), + id = 1:4 + ) + + y <- data.frame( + v1 = c(2, 3, 4, -0.1), + id = 1:4 + ) + + out <- recordLinkage(x, + y, + x_id = "id", + y_id = "id", + na_action = "ignore", + return_matrix = TRUE, + vars = c("v1")) + + expect_equal(out$correct_match_rate, 0) +}) + +test_that("gower distance matrix is computed as expected for mixed-type linkage variables", { + x <- data.frame( + id = 1:2, + age = c(20, 40), + sex = factor(c("f", "m")), + region = c("A", "B"), + stringsAsFactors = FALSE + ) + + y <- data.frame( + id = 1:2, + age = c(20, 30), + sex = factor(c("f", "m")), + region = c("B", "B"), + stringsAsFactors = FALSE + ) + + out <- recordLinkage( + x = x, + y = y, + vars = c("age", "sex", "region"), + distance = "gower", + x_id = "id", + y_id = "id", + return_matrix = TRUE, + tol = 0 + ) + + expect_true(is.matrix(out$distance_matrix)) + expect_equal(dim(out$distance_matrix), c(2, 2)) + + # Manual Gower distances: + # age range over combined data = 40 - 20 = 20 + # + # x1=(20,f,A) vs y1=(20,f,B): + # age = 0/20 = 0 + # sex = 0 + # region = 1 + # mean = (0 + 0 + 1) / 3 = 1/3 + # + # x1=(20,f,A) vs y2=(30,m,B): + # age = 10/20 = 0.5 + # sex = 1 + # region = 1 + # mean = (0.5 + 1 + 1) / 3 = 5/6 + # + # x2=(40,m,B) vs y1=(20,f,B): + # age = 20/20 = 1 + # sex = 1 + # region = 0 + # mean = 2/3 + # + # x2=(40,m,B) vs y2=(30,m,B): + # age = 10/20 = 0.5 + # sex = 0 + # region = 0 + # mean = 1/6 + expected <- matrix( + c( + 1/3, 5/6, + 2/3, 1/6 + ), + nrow = 2, + byrow = TRUE, + dimnames = list(c("1", "2"), c("1", "2")) + ) + + expect_equal(out$distance_matrix, expected, tolerance = 1e-8) + + # Optimal assignment should be x1->y1 and x2->y2 + expect_equal(out$matches$y_row, c(1L, 2L)) + expect_true(all(out$matches$correct_match)) + expect_equal(out$correct_match_rate, 1) +}) + +test_that("weighted gower distance matrix is computed as expected for mixed-type linkage variables", { + x <- data.frame( + id = 1:2, + age = c(20, 40), + sex = factor(c("f", "m")), + region = c("A", "B"), + stringsAsFactors = FALSE + ) + + y <- data.frame( + id = 1:2, + age = c(20, 30), + sex = factor(c("f", "m")), + region = c("B", "B"), + stringsAsFactors = FALSE + ) + + w <- c(age = 2, sex = 1, region = 1) + + out <- recordLinkage( + x = x, + y = y, + vars = c("age", "sex", "region"), + distance = "gower", + weights = w, + x_id = "id", + y_id = "id", + return_matrix = TRUE, + tol = 0 + ) + + expect_true(is.matrix(out$distance_matrix)) + expect_equal(dim(out$distance_matrix), c(2, 2)) + + # Manual weighted Gower distances: + # age range over combined data = 40 - 20 = 20 + # weights sum = 2 + 1 + 1 = 4 + # + # x1=(20,f,A) vs y1=(20,f,B): + # age = 0/20 = 0, weighted = 2*0 + # sex = 0, weighted = 1*0 + # region = 1, weighted = 1*1 + # total = (0 + 0 + 1) / 4 = 1/4 + # + # x1=(20,f,A) vs y2=(30,m,B): + # age = 10/20 = 0.5, weighted = 2*0.5 = 1 + # sex = 1, weighted = 1 + # region = 1, weighted = 1 + # total = (1 + 1 + 1) / 4 = 3/4 + # + # x2=(40,m,B) vs y1=(20,f,B): + # age = 20/20 = 1, weighted = 2 + # sex = 1, weighted = 1 + # region = 0, weighted = 0 + # total = (2 + 1 + 0) / 4 = 3/4 + # + # x2=(40,m,B) vs y2=(30,m,B): + # age = 10/20 = 0.5, weighted = 1 + # sex = 0, weighted = 0 + # region = 0, weighted = 0 + # total = (1 + 0 + 0) / 4 = 1/4 + + expected <- matrix( + c(1/4, 3/4, + 3/4, 1/4), + nrow = 2, + byrow = TRUE, + dimnames = list(c("1", "2"), c("1", "2")) + ) + + expect_equal(out$distance_matrix, expected, tolerance = 1e-8) + + # Optimal assignment should be x1->y1 and x2->y2 + expect_equal(out$matches$y_row, c(1L, 2L)) + expect_true(all(out$matches$correct_match)) + expect_equal(out$correct_match_rate, 1) +}) + +test_that("Hungarian assignment finds the global minimum one-to-one matching", { + x <- data.frame( + id = 1:3, + a = c(0, 2, 100) + ) + + y <- data.frame( + id = 1:3, + a = c(0, 100, 101) + ) + + out <- recordLinkage( + x = x, + y = y, + vars = "a", + distance = "euclidean", + x_id = "id", + y_id = "id", + return_matrix = TRUE + ) + + # Cost matrix should be: + # y1 y2 y3 + # x1(0) 0 100 101 + # x2(2) 2 98 99 + # x3(100) 100 0 1 + + expected <- matrix( + c( + 0, 100, 101, + 2, 98, 99, + 100, 0, 1 + ), + nrow = 3, + byrow = TRUE, + dimnames = list(c("1", "2", "3"), c("1", "2", "3")) + ) + + expect_equal(out$distance_matrix, expected, tolerance = 1e-8) + + # Greedy row-wise would pick x1->y1, x2->y1, x3->y2. + # Under one-to-one assignment, the global optimum is: + # x1 -> y1 (0) + # x2 -> y3 (99) + # x3 -> y2 (0) + # total = 99 + # + # Alternative feasible assignment x1->y1, x2->y2, x3->y3 totals 99 as well. + # So the key property to test is that: + # 1) assignment is one-to-one + # 2) total distance is globally minimal + expect_equal(sort(out$matches$y_row), 1:3) + expect_equal(out$total_distance, 99, tolerance = 1e-8) +}) + +test_that("recordLinkage returns expected structure for a simple gower match", { + x <- data.frame( + id = c(1, 2, 3), + age = c(23, 40, 35), + sex = factor(c("f", "m", "f")), + region = c("A", "B", "A"), + stringsAsFactors = FALSE + ) + + y <- data.frame( + id = c(1, 2, 3), + age = c(24, 39, 35), + sex = factor(c("f", "m", "f")), + region = c("A", "B", "B"), + stringsAsFactors = FALSE + ) + + out <- recordLinkage( + x = x, + y = y, + vars = c("age", "sex", "region"), + distance = "gower", + x_id = "id", + y_id = "id" + ) + + expect_s3_class(out, "recordLinkage") + expect_type(out, "list") + + expect_named( + out, + c( + "matches", + "correct_matches", + "correct_match_rate", + "mean_distance", + "total_distance", + "call" + ), + ignore.order = TRUE + ) + + expect_s3_class(out$matches, "data.frame") + expect_equal( + names(out$matches), + c("x_row", "y_row", "x_id", "y_id", "distance", "correct_match", "n_best") + ) + expect_equal(nrow(out$matches), 3L) + + expect_true(is.numeric(out$correct_matches)) + expect_true(is.numeric(out$correct_match_rate)) + expect_true(is.numeric(out$mean_distance)) + expect_true(is.numeric(out$total_distance)) + + expect_equal(out$correct_matches, sum(out$matches$correct_match, na.rm = TRUE)) + expect_equal(out$correct_match_rate, mean(out$matches$correct_match, na.rm = TRUE)) + expect_equal(out$mean_distance, mean(out$matches$distance)) + expect_equal(out$total_distance, sum(out$matches$distance)) +}) + +test_that("recordLinkage uses row order as truth when x_id and y_id are NULL", { + x <- data.frame(a = c(1, 2), b = c("x", "y"), stringsAsFactors = FALSE) + y <- data.frame(a = c(1, 2), b = c("x", "y"), stringsAsFactors = FALSE) + + out <- recordLinkage( + x = x, + y = y, + vars = c("a", "b") + ) + + expect_equal(out$matches$x_id, 1:2) + expect_equal(out$matches$y_id, 1:2) + expect_true(all(out$matches$correct_match)) + expect_equal(out$correct_matches, 2) + expect_equal(out$correct_match_rate, 1) +}) + +test_that("recordLinkage returns distance_matrix when requested", { + x <- data.frame(id = 1:3, v = c(1, 2, 3)) + y <- data.frame(id = 1:3, v = c(1, 2, 4)) + + out <- recordLinkage( + x = x, + y = y, + vars = "v", + distance = "euclidean", + x_id = "id", + y_id = "id", + return_matrix = TRUE + ) + + expect_true("distance_matrix" %in% names(out)) + expect_true(is.matrix(out$distance_matrix)) + expect_equal(dim(out$distance_matrix), c(3, 3)) + + expect_equal( + out$matches$distance, + out$distance_matrix[cbind(seq_len(nrow(x)), out$matches$y_row)] + ) +}) + +test_that("recordLinkage validates x and y as data.frames", { + y <- data.frame(a = 1) + + expect_error( + recordLinkage(x = 1, y = y, vars = "a"), + "`x` must be a data.frame.", + fixed = TRUE + ) + + expect_error( + recordLinkage(x = y, y = 1, vars = "a"), + "`y` must be a data.frame.", + fixed = TRUE + ) +}) + +test_that("recordLinkage validates vars", { + x <- data.frame(a = 1:2) + y <- data.frame(a = 1:2) + + expect_error( + recordLinkage(x, y, vars = NULL), + "`vars` must be a non-empty character vector.", + fixed = TRUE + ) + + expect_error( + recordLinkage(x, y, vars = character()), + "`vars` must be a non-empty character vector.", + fixed = TRUE + ) + + expect_error( + recordLinkage(x, y, vars = 1), + "`vars` must be a non-empty character vector.", + fixed = TRUE + ) +}) + +test_that("recordLinkage errors when vars are missing from x or y", { + x <- data.frame(a = 1:2, b = 3:4) + y <- data.frame(a = 1:2, c = 3:4) + + expect_error( + recordLinkage(x, y, vars = c("a", "c")), + "Variables not found in `x`: c", + fixed = TRUE + ) + + expect_error( + recordLinkage(x, y, vars = c("a", "b")), + "Variables not found in `y`: b", + fixed = TRUE + ) +}) + +test_that("recordLinkage errors when x and y have different number of rows", { + x <- data.frame(a = 1:2) + y <- data.frame(a = 1:3) + + expect_error( + recordLinkage(x, y, vars = "a"), + "`x` and `y` must have the same number of rows for strict one-to-one Hungarian assignment.", + fixed = TRUE + ) +}) + +test_that("recordLinkage validates tol", { + x <- data.frame(a = 1:2) + y <- data.frame(a = 1:2) + + expect_error( + recordLinkage(x, y, vars = "a", tol = -1), + "`tol` must be a single non-negative finite number.", + fixed = TRUE + ) + + expect_error( + recordLinkage(x, y, vars = "a", tol = c(1, 2)), + "`tol` must be a single non-negative finite number.", + fixed = TRUE + ) + + expect_error( + recordLinkage(x, y, vars = "a", tol = NA_real_), + "`tol` must be a single non-negative finite number.", + fixed = TRUE + ) +}) + +test_that("recordLinkage validates x_id and y_id", { + x <- data.frame(id = 1:2, a = 1:2) + y <- data.frame(id = 1:2, a = 1:2) + + expect_error( + recordLinkage(x, y, vars = "a", x_id = 1), + "`x_id` must be NULL or a single column name.", + fixed = TRUE + ) + + expect_error( + recordLinkage(x, y, vars = "a", y_id = 1), + "`y_id` must be NULL or a single column name.", + fixed = TRUE + ) + + expect_error( + recordLinkage(x, y, vars = "a", x_id = "zzz"), + "`x_id` not found in `x`.", + fixed = TRUE + ) + + expect_error( + recordLinkage(x, y, vars = "a", y_id = "zzz"), + "`y_id` not found in `y`.", + fixed = TRUE + ) +}) + +test_that("recordLinkage validates weights", { + x <- data.frame(a = 1:2, b = 3:4) + y <- data.frame(a = 1:2, b = 3:4) + + expect_error( + recordLinkage(x, y, vars = c("a", "b"), weights = 1), + "`weights` must be numeric with length equal to `length(vars)`.", + fixed = TRUE + ) + + expect_error( + recordLinkage(x, y, vars = c("a", "b"), weights = c(1, NA)), + "`weights` must contain finite non-negative values.", + fixed = TRUE + ) + + expect_error( + recordLinkage(x, y, vars = c("a", "b"), weights = c(1, -1)), + "`weights` must contain finite non-negative values.", + fixed = TRUE + ) + + expect_error( + recordLinkage(x, y, vars = c("a", "b"), weights = c(0, 0)), + "At least one weight must be positive.", + fixed = TRUE + ) +}) + +test_that("recordLinkage respects na_action = 'fail'", { + x <- data.frame(a = c(1, NA)) + y <- data.frame(a = c(1, 2)) + + expect_error( + recordLinkage(x, y, vars = "a", na_action = "fail"), + "Missing values found in linkage variables and `na_action = 'fail'`.", + fixed = TRUE + ) +}) + +test_that("recordLinkage allows NA values when na_action = 'ignore' if pairwise distances remain computable", { + x <- data.frame( + id = 1:2, + a = c(1, NA), + b = factor(c("x", "y")) + ) + + y <- data.frame( + id = 1:2, + a = c(1, NA), + b = factor(c("x", "y")) + ) + + out <- recordLinkage( + x, y, + vars = c("a", "b"), + x_id = "id", + y_id = "id", + na_action = "ignore" + ) + + expect_s3_class(out, "recordLinkage") + expect_equal(nrow(out$matches), 2L) + expect_true(all(is.finite(out$matches$distance))) +}) + +test_that("correct_match is NA when truth IDs cannot be evaluated", { + x <- data.frame( + id = c(1, NA, 3), + age = c(20, 30, 40), + sex = factor(c("f", "m", "f")) + ) + + y <- data.frame( + id = c(1, 2, 3), + age = c(20, 30, 40), + sex = factor(c("f", "m", "f")) + ) + + out <- suppressWarnings( + recordLinkage( + x = x, + y = y, + vars = c("age", "sex"), + distance = "gower", + x_id = "id", + y_id = "id", + na_action = "ignore" + ) + ) + + expect_true(anyNA(out$matches$correct_match)) + expect_equal(out$correct_matches, sum(out$matches$correct_match, na.rm = TRUE)) +}) + +test_that("correct_match_rate is NA if no matched pairs are evaluable", { + x <- data.frame( + id = c(NA, NA), + age = c(20, 30) + ) + + y <- data.frame( + id = c(NA, NA), + age = c(20, 30) + ) + + out <- suppressWarnings( + recordLinkage( + x = x, + y = y, + vars = "age", + distance = "gower", + x_id = "id", + y_id = "id", + na_action = "ignore" + ) + ) + + expect_true(all(is.na(out$matches$correct_match))) + expect_true(is.na(out$correct_match_rate)) + expect_equal(out$correct_matches, 0) +}) + +test_that("euclidean and manhattan require numeric/integer/logical linkage variables", { + x <- data.frame(a = 1:2, b = c("x", "y"), stringsAsFactors = FALSE) + y <- data.frame(a = 1:2, b = c("x", "y"), stringsAsFactors = FALSE) + + expect_error( + recordLinkage(x, y, vars = c("a", "b"), distance = "euclidean"), + "Use `distance = 'gower'` for mixed-type data.", + fixed = TRUE + ) + + expect_error( + recordLinkage(x, y, vars = c("a", "b"), distance = "manhattan"), + "Use `distance = 'gower'` for mixed-type data.", + fixed = TRUE + ) +}) + +test_that("euclidean and manhattan work with numeric, integer, and logical variables", { + x <- data.frame( + id = 1:3, + a = c(0, 1, 2), + b = c(TRUE, FALSE, TRUE) + ) + y <- data.frame( + id = 1:3, + a = c(0, 1, 2), + b = c(TRUE, FALSE, TRUE) + ) + + out_euc <- recordLinkage( + x, y, + vars = c("a", "b"), + distance = "euclidean", + x_id = "id", + y_id = "id" + ) + + out_man <- recordLinkage( + x, y, + vars = c("a", "b"), + distance = "manhattan", + x_id = "id", + y_id = "id" + ) + + expect_equal(out_euc$correct_matches, 3) + expect_equal(out_man$correct_matches, 3) + expect_true(all(out_euc$matches$correct_match)) + expect_true(all(out_man$matches$correct_match)) +}) + +test_that("n_best counts row-wise tied minima from the original cost matrix", { + x <- data.frame(id = 1:2, a = c(0, 1)) + y <- data.frame(id = 1:2, a = c(0, 0)) + + out <- recordLinkage( + x, y, + vars = "a", + distance = "euclidean", + x_id = "id", + y_id = "id", + tol = 0 + ) + + expect_equal(out$matches$n_best, c(2L, 2L)) +}) + +test_that("zero tolerance means only exact ties are counted in n_best", { + x <- data.frame(id = 1:2, a = c(0, 2)) + y <- data.frame(id = 1:2, a = c(0, 1e-7)) + + out <- recordLinkage( + x, y, + vars = "a", + distance = "euclidean", + x_id = "id", + y_id = "id", + tol = 0 + ) + + expect_equal(out$matches$n_best[1], 1L) +}) + +test_that("unordered factors and characters are harmonized to common nominal levels", { + x <- data.frame( + v = factor(c("a", "b", NA), levels = c("a", "b")) + ) + + y <- data.frame( + v = c("b", "c", NA), + stringsAsFactors = FALSE + ) + + out <- .harmonize_linkage_data(x, y) + + expect_true(is.factor(out$x$v)) + expect_true(is.factor(out$y$v)) + expect_false(is.ordered(out$x$v)) + expect_false(is.ordered(out$y$v)) + + expect_identical(levels(out$x$v), c("a", "b", "c")) + expect_identical(levels(out$y$v), c("a", "b", "c")) + + expect_true(is.na(out$x$v[3])) + expect_true(is.na(out$y$v[3])) +}) + +test_that("ordered factors with identical levels are preserved as ordered", { + lev <- c("low", "medium", "high") + + x <- data.frame( + v = ordered(c("low", "high", NA), levels = lev) + ) + + y <- data.frame( + v = ordered(c("medium", "high", NA), levels = lev) + ) + + out <- .harmonize_linkage_data(x, y) + + expect_true(is.ordered(out$x$v)) + expect_true(is.ordered(out$y$v)) + expect_identical(levels(out$x$v), lev) + expect_identical(levels(out$y$v), lev) + + expect_true(is.na(out$x$v[3])) + expect_true(is.na(out$y$v[3])) +}) + + +test_that("ordered factors with different level orders throw an error", { + x <- data.frame( + v = ordered(c("low", "high"), levels = c("low", "medium", "high")) + ) + + y <- data.frame( + v = ordered(c("medium", "high"), levels = c("medium", "low", "high")) + ) + + expect_error( + .harmonize_linkage_data(x, y), + "Ordered factor levels for variable `v` are not identical in `x` and `y`." + ) +}) + +test_that("ordered variable in x and character variable in y are harmonized using x levels", { + x <- data.frame( + v = ordered(c("low", "high", NA), levels = c("low", "medium", "high")) + ) + + y <- data.frame( + v = c("medium", "high", NA), + stringsAsFactors = FALSE + ) + + out <- .harmonize_linkage_data(x, y) + + expect_true(is.ordered(out$x$v)) + expect_true(is.ordered(out$y$v)) + expect_identical(levels(out$x$v), c("low", "medium", "high")) + expect_identical(levels(out$y$v), c("low", "medium", "high")) +}) + +test_that("ordered variable and incompatible character values throw an error", { + x <- data.frame( + v = ordered(c("low", "high"), levels = c("low", "medium", "high")) + ) + + y <- data.frame( + v = c("medium", "very_high"), + stringsAsFactors = FALSE + ) + + expect_error( + .harmonize_linkage_data(x, y), + "Variable `v` is ordered in `x`, but `y` contains values not present in the ordered levels: very_high" + ) +}) + +test_that("logical variables are preserved as logical", { + x <- data.frame(v = c(TRUE, FALSE, NA)) + y <- data.frame(v = c(FALSE, TRUE, NA)) + + out <- .harmonize_linkage_data(x, y) + + expect_true(is.logical(out$x$v)) + expect_true(is.logical(out$y$v)) + expect_true(is.na(out$x$v[3])) + expect_true(is.na(out$y$v[3])) +}) + +test_that("numeric and integer variables are harmonized to numeric", { + x <- data.frame(v = c(1L, 2L, NA)) + y <- data.frame(v = c(1.5, 2.5, NA)) + + out <- .harmonize_linkage_data(x, y) + + expect_true(is.numeric(out$x$v)) + expect_true(is.numeric(out$y$v)) + expect_equal(out$x$v[1], 1) + expect_equal(out$y$v[1], 1.5) +}) + +test_that("unsupported variable class combinations throw an error", { + x <- data.frame(v = I(list(1, 2))) + y <- data.frame(v = I(list(1, 2))) + + expect_error( + .harmonize_linkage_data(x, y), + "Unsupported or incompatible variable classes for variable `v`" + ) +}) + +test_that("print.recordLinkage prints key summary fields invisibly", { + x <- data.frame(id = 1:2, a = c(1, 2)) + y <- data.frame(id = 1:2, a = c(1, 2)) + + out <- recordLinkage( + x, y, + vars = "a", + distance = "euclidean", + x_id = "id", + y_id = "id" + ) + + expect_output(print(out), "Correct matches:", fixed = TRUE) + expect_output(print(out), "Correct match percent:", fixed = TRUE) + expect_output(print(out), "Mean distance:", fixed = TRUE) + expect_invisible(print(out)) +}) \ No newline at end of file