From eee4229dce5d0e6e59dd278b9a776f6260e6725e Mon Sep 17 00:00:00 2001 From: xuefliang Date: Sun, 3 May 2026 21:59:04 +0800 Subject: [PATCH] feat:Compute local collinearity diagnostics including VIF, CN, and VDP --- R/gwr_collin_diag.R | 324 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 324 insertions(+) create mode 100755 R/gwr_collin_diag.R diff --git a/R/gwr_collin_diag.R b/R/gwr_collin_diag.R new file mode 100755 index 0000000..39fe2c8 --- /dev/null +++ b/R/gwr_collin_diag.R @@ -0,0 +1,324 @@ +#' GWR Collinearity Diagnostics +#' +#' Compute local collinearity diagnostics including VIF, CN, and VDP +#' for geographically weighted regression. This implementation follows +#' the methodology of GWmodel::gwr.collin.diagno(). +#' +#' @param formula Regression model formula. +#' @param data A `sf` object. +#' @param bw Bandwidth value. +#' @param adaptive Whether the bandwidth is adaptive (number of neighbors). +#' @param kernel Kernel function: "gaussian", "exp", "bisquare", "tricube", "boxcar". +#' @param longlat Whether coordinates are longitude/latitude. +#' @param p Power of Minkowski distance (2 = Euclidean). +#' @param theta Angle to rotate coordinates (radians). +#' @param dMat Optional pre-computed distance matrix. +#' +#' @return A list containing: +#' \describe{ +#' \item{SDF}{sf object with VIF, local_CN, VDP, and correlations} +#' \item{local_CN}{Local condition numbers} +#' \item{VIF}{Matrix of local VIF values (locations x variables, excluding intercept)} +#' \item{VDP}{Matrix of variance decomposition proportions for smallest singular value} +#' \item{corr.mat}{Local correlation coefficients between variable pairs} +#' } +#' +#' @details +#' Collinearity diagnostics interpretation: +#' \itemize{ +#' \item VIF > 10: indicates problematic collinearity +#' \item CN > 30: indicates severe collinearity +#' \item VDP: values > 0.5 for two or more variables with high CN indicate collinearity +#' } +#' +#' @references +#' Belsley, D.A., Kuh, E., and Welsch, R.E. (1980). Regression Diagnostics. +#' New York: John Wiley & Sons. +#' +#' Wheeler, D. and Tiefelsdorf, M. (2005). Multicollinearity and correlation +#' among local regression coefficients in geographically weighted regression. +#' Journal of Geographical Systems, 7(2), 161-187. +#' +#' @examples +#' data(LondonHP) +#' diag <- gwr_collin_diag( +#' PURCHASE ~ FLOORSZ + UNEMPLOY + PROF, +#' data = LondonHP, +#' bw = 64, +#' adaptive = TRUE +#' ) +#' print(diag) +#' +#' @importFrom stats model.frame model.extract model.matrix terms cov.wt +#' @export +gwr_collin_diag <- function( + formula, + data, + bw, + adaptive = FALSE, + kernel = c("bisquare", "gaussian", "exp", "tricube", "boxcar"), + longlat = FALSE, + p = 2.0, + theta = 0.0, + dMat = NULL +) { + kernel <- match.arg(kernel) + + ### Extract coords + coords <- as.matrix(sf::st_coordinates(sf::st_centroid(data))) + n <- nrow(coords) + + ### Extract variables + mf <- model.frame(formula = formula(formula), data = sf::st_drop_geometry(data)) + mt <- attr(mf, "terms") + y <- model.extract(mf, "response") + x <- model.matrix(mt, mf) + + has_intercept <- attr(terms(mf), "intercept") == 1 + indep_vars <- colnames(x) + indep_vars[which(indep_vars == "(Intercept)")] <- "Intercept" + colnames(x) <- indep_vars + + var_n <- ncol(x) # number of variables including intercept + + if (var_n <= 1) { + stop("The number of independent variables must be larger than one") + } + + # x1: design matrix without intercept (for VIF calculation) + if (has_intercept) { + x1 <- x[, -1, drop = FALSE] + } else { + x1 <- x + } + + ### Compute distance matrix if not provided + if (is.null(dMat)) { + if (longlat) { + # Haversine formula for great circle distance + lon_rad <- coords[, 1] * pi / 180 + lat_rad <- coords[, 2] * pi / 180 + + dMat <- matrix(0, n, n) + for (i in 1:(n - 1)) { + for (j in (i + 1):n) { + dlat <- lat_rad[j] - lat_rad[i] + dlon <- lon_rad[j] - lon_rad[i] + a <- sin(dlat / 2)^2 + cos(lat_rad[i]) * cos(lat_rad[j]) * sin(dlon / 2)^2 + dMat[i, j] <- 2 * 6371 * asin(sqrt(a)) # Earth radius 6371 km + dMat[j, i] <- dMat[i, j] + } + } + } else { + # Euclidean or Minkowski distance + if (p == 2.0 && theta == 0.0) { + dMat <- as.matrix(stats::dist(coords)) + } else { + # Minkowski with rotation + coords_rot <- coords + if (theta != 0) { + cos_t <- cos(theta) + sin_t <- sin(theta) + coords_rot[, 1] <- coords[, 1] * cos_t - coords[, 2] * sin_t + coords_rot[, 2] <- coords[, 1] * sin_t + coords[, 2] * cos_t + } + dMat <- as.matrix(stats::dist(coords_rot, method = "minkowski", p = p)) + } + } + } + + ### Kernel weight function + gw_weight <- function(dist, bw, kernel, adaptive) { + if (adaptive) { + # Adaptive: bw is number of neighbors + bw_dist <- sort(dist)[min(bw, length(dist))] + } else { + bw_dist <- bw + } + + w <- switch(kernel, + "gaussian" = exp(-0.5 * (dist / bw_dist)^2), + "exp" = exp(-dist / bw_dist), + "bisquare" = ifelse(dist <= bw_dist, (1 - (dist / bw_dist)^2)^2, 0), + "tricube" = ifelse(dist <= bw_dist, (1 - (dist / bw_dist)^3)^3, 0), + "boxcar" = ifelse(dist <= bw_dist, 1, 0) + ) + return(w) + } + + ### Initialize output matrices + # Correlation matrix: (var_n-1)*var_n/2 pairs + n_corr_pairs <- (var_n - 1) * var_n / 2 + corr_mat <- matrix(0, nrow = n, ncol = n_corr_pairs) + + # VIF matrix: var_n-1 variables (excluding intercept) + vifs_mat <- matrix(0, nrow = n, ncol = var_n - 1) + + # Condition index: var_n values per location + vdp_idx <- matrix(0, nrow = n, ncol = var_n) + + # VDP array: n x var_n x var_n + vdp_pi <- array(0, dim = c(n, var_n, var_n)) + + ### Compute diagnostics for each location + for (i in 1:n) { + # Get distances to location i + dist_vi <- dMat[, i] + + # Compute weights + W_i <- gw_weight(dist_vi, bw, kernel, adaptive) + + # Normalize weights (sum to 1) - as in GWmodel + sum_w <- sum(W_i) + Wi <- W_i / sum_w + + ### Local correlations between all variable pairs + tag <- 0 + for (j in 1:(var_n - 1)) { + for (k in (j + 1):var_n) { + tag <- tag + 1 + corr_mat[i, tag] <- cov.wt(cbind(x[, j], x[, k]), + wt = Wi, cor = TRUE)$cor[1, 2] + } + } + + ### VIF calculation using weighted correlation matrix (without intercept) + # This is the standard approach: VIF = diag(solve(R)) + # where R is the correlation matrix + tryCatch({ + corr_mati <- cov.wt(x1, wt = Wi, cor = TRUE)$cor + vifs_mat[i, ] <- diag(solve(corr_mati)) + }, error = function(e) { + vifs_mat[i, ] <- NA + }) + + ### Variance-decomposition proportions (VDP) + # Following Belsley, Kuh, and Welsch (1980) + # 1. Weight the design matrix: X * W (element-wise by row) + xw <- sweep(x, 1, Wi, "*") + + # 2. Column-scale to unit length: divide each column by its L2 norm + col_norms <- sqrt(colSums(xw^2)) + col_norms[col_norms == 0] <- 1 # avoid division by zero + xw_scaled <- sweep(xw, 2, col_norms, "/") + + # 3. SVD of scaled weighted design matrix + svd_x <- svd(xw_scaled) + + # 4. Condition indices: max(d) / d[k] + vdp_idx[i, ] <- svd_x$d[1] / pmax(svd_x$d, 1e-10) + + # 5. VDP calculation + # Phi = V * diag(1/d), then square and transpose + Phi <- svd_x$v %*% diag(1 / pmax(svd_x$d, 1e-10)) + Phi <- t(Phi^2) + + # 6. Normalize by column (prop.table by column) + col_sums <- colSums(Phi) + col_sums[col_sums == 0] <- 1 + pi_ij <- sweep(Phi, 2, col_sums, "/") + + vdp_pi[i, , ] <- pi_ij + } + + ### Create column names for correlations + corr_nms <- c() + for (i in 1:(var_n - 1)) { + for (j in (i + 1):var_n) { + corr_nms <- c(corr_nms, paste("Corr", paste(indep_vars[i], indep_vars[j], sep = "."), sep = "_")) + } + } + + ### Extract final outputs (following GWmodel convention) + # local_CN: the largest condition index (last column) + local_CN <- vdp_idx[, var_n] + + # VDP: proportions for the smallest singular value (last column of each pi_ij) + VDP <- vdp_pi[, var_n, ] + + ### Create result data frame + # VIF column names (without intercept) + vif_nms <- paste(indep_vars[-1], "VIF", sep = "_") + + # VDP column names (all variables) + vdp_nms <- paste(indep_vars, "VDP", sep = "_") + + res_df <- data.frame( + vifs_mat, + local_CN = local_CN, + VDP, + corr_mat + ) + colnames(res_df) <- c(vif_nms, "local_CN", vdp_nms, corr_nms) + + ### Create sf object + res_df$geometry <- sf::st_geometry(data) + SDF <- sf::st_sf(res_df) + + ### Return results + result <- list( + SDF = SDF, + corr.mat = corr_mat, + VIF = vifs_mat, + local_CN = local_CN, + VDP = VDP, + indep_vars = indep_vars, + call = match.call() + ) + class(result) <- "gwcollindiag" + result +} + +#' Print GWR Collinearity Diagnostics +#' @param x A gwcollindiag object +#' @param ... Additional arguments +#' @method print gwcollindiag +#' @export +print.gwcollindiag <- function(x, ...) { + cat("GWR Collinearity Diagnostics\n") + cat("============================\n") + cat("Formula:", deparse(x$call$formula), "\n") + cat("Variables:", paste(x$indep_vars, collapse = ", "), "\n\n") + + cat("Local Condition Number (CN) Summary:\n") + print(summary(x$local_CN)) + cat("\n") + + cat("Local VIF Summary (excluding intercept):\n") + vif_summary <- apply(x$VIF, 2, summary) + colnames(vif_summary) <- x$indep_vars[-1] + print(vif_summary) + cat("\n") + + n_high_cn <- sum(x$local_CN > 30, na.rm = TRUE) + n_high_vif <- sum(apply(x$VIF, 1, function(v) any(v > 10, na.rm = TRUE)), na.rm = TRUE) + + cat("Diagnostic Summary:\n") + cat(sprintf(" Locations with CN > 30: %d (%.1f%%)\n", + n_high_cn, 100 * n_high_cn / length(x$local_CN))) + cat(sprintf(" Locations with any VIF > 10: %d (%.1f%%)\n", + n_high_vif, 100 * n_high_vif / length(x$local_CN))) + + cat("\nLocal VDP Summary (for smallest singular value):\n") + vdp_summary <- apply(x$VDP, 2, summary) + colnames(vdp_summary) <- x$indep_vars + print(vdp_summary) +} + +#' Plot GWR Collinearity Diagnostics +#' @param x A gwcollindiag object +#' @param which Which plot: "CN" for condition number, "VIF" for VIF maps, "VDP" for VDP maps +#' @param ... Additional arguments passed to sf::plot +#' @method plot gwcollindiag +#' @export +plot.gwcollindiag <- function(x, which = "CN", ...) { + if (which == "CN") { + plot(x$SDF["local_CN"], main = "Local Condition Number", ...) + } else if (which == "VIF") { + vif_cols <- grep("_VIF$", names(x$SDF), value = TRUE) + plot(x$SDF[vif_cols], ...) + } else if (which == "VDP") { + vdp_cols <- grep("_VDP$", names(x$SDF), value = TRUE) + plot(x$SDF[vdp_cols], ...) + } +}