diff --git a/DESCRIPTION b/DESCRIPTION index bf4fec0..f3111b5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: spmodel Title: Spatial Statistical Modeling and Prediction -Version: 0.10.0 +Version: 0.11.0 Authors@R: c( person(given = "Michael", family = "Dumelle", diff --git a/NEWS.md b/NEWS.md index d928390..3b0956c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,19 @@ +# spmodel 0.11.0 + +## Major Updates + +* Added support for Block Prediction (i.e., Block Kriging) to estimate averages (and their uncertainties) over a geographic region. +* Added a vignette to the [`spmodel` website](https://usepa.github.io/spmodel/) titled "Block Prediction (i.e., Block Kriging) in spmodel". +* Added a `fc_borders` data set which contains borders for the Four Corners states in the United States. + +## Bug Fixes + +* Fixed a bug that incorrectly calculated the log determinant of the fixed effects in the restricted log likelihood. +* Fixed a bug that caused an error when `covmatrix(..., cov_type = "pred.pred")` was called on `object` with a non-`NULL` `newdata` element. + # spmodel 0.10.0 -## Minor updates +## Minor Updates * Added a robust semivariogram option to `esv()`; see the `robust`argument to `esv()` ([#28](https://github.com/USEPA/spmodel/issues/28)). * Added `"none"` and `"ie"` spatial covariance types (via `spcov_type` or `spcov_initial`) to `spautor()`, `spgautor()`, and `spautorRF()` ([#27](https://github.com/USEPA/spmodel/issues/27)). diff --git a/R/covmatrix.R b/R/covmatrix.R index efe25a2..866f855 100644 --- a/R/covmatrix.R +++ b/R/covmatrix.R @@ -65,7 +65,7 @@ covmatrix.splm <- function(object, newdata, cov_type, ...) { if (cov_type == "pred.pred") { if (inherits(object$newdata, "sf")) { - object$obdata <- sf_to_df(newdata) + object$obdata <- sf_to_df(object$newdata) } else { object$obdata <- newdata } diff --git a/R/data.R b/R/data.R index 5ee5143..1566e1a 100644 --- a/R/data.R +++ b/R/data.R @@ -211,3 +211,23 @@ #' @source #' The data source is the \code{elect80} data set in the \code{spData} R package. "texas" + + +#' Four Corners State Borders +#' +#' @description State borders for the four corners states in the United States: +#' Arizona, Colorado, New Mexico, Utah. +#' +#' @format An \code{sf} object with 4 rows and 4 columns: +#' +#' \itemize{ +#' \item NAME: State name. +#' \item STUSPS: State postal code. +#' \item GEO.ID: State GEO.ID from the United States Census Bureau. +#' \item geometry: \code{POINT} geometry representing coordinates in a NAD83 +#' projection (EPSG: 5070). Distances between points are in meters. +#' } +#' @source +#' The data source is the United States Census Bureau TIGER/Line Shapefiles. +"fc_borders" +# start with us_borders <- tigris::states(cb = TRUE, year = 2024) diff --git a/R/dispersion_optim2orig.R b/R/dispersion_optim2orig.R index e7541ea..9e2c084 100644 --- a/R/dispersion_optim2orig.R +++ b/R/dispersion_optim2orig.R @@ -10,6 +10,9 @@ dispersion_optim2orig <- function(dispersion_orig2optim, par) { } else { fill_optim_par_val <- par[length(par)] # dispersion is the last element fill_orig_val <- exp(fill_optim_par_val) + # cap lower and upper dispersion values for numeric stability + fill_orig_val <- pmax(1e-8, fill_orig_val) + fill_orig_val <- pmin(1e8, fill_orig_val) names(fill_orig_val) <- "dispersion" new_par <- par[-length(par)] # remove dispersion parameter } diff --git a/R/dispersion_orig2optim.R b/R/dispersion_orig2optim.R index bae76c0..f22b005 100644 --- a/R/dispersion_orig2optim.R +++ b/R/dispersion_orig2optim.R @@ -5,6 +5,7 @@ dispersion_orig2optim <- function(dispersion_initial) { names(dispersion_orig2optim_is_known) <- "dispersion_log" # return dispersion parameter vector + # can consider lower limit like -10, 10 for numerical stability dispersion_orig2optim_val <- ifelse(dispersion_orig2optim_val > 50 & !dispersion_orig2optim_is_known, 50, dispersion_orig2optim_val) dispersion_orig2optim_val <- ifelse(dispersion_orig2optim_val < -50 & !dispersion_orig2optim_is_known, -50, dispersion_orig2optim_val) diff --git a/R/get_local_list.R b/R/get_local_list.R index 0794922..052e83b 100644 --- a/R/get_local_list.R +++ b/R/get_local_list.R @@ -156,3 +156,44 @@ get_local_list_prediction <- function(local) { local } + + + + +get_local_list_prediction_block <- function(local) { + + # set local neighborhood size + # method can be "all" (for all data), "distance" (for local distance neighborhoods) + # or "covariance" (for local covariance neighborhoods) + + if (is.logical(local)) { + if (local) { + local <- list(method = "covariance", size = 1000) + } else { + local <- list(method = "all") + } + } + + names_local <- names(local) + + # errors + if ("method" %in% names_local) { + if (!local$method %in% c("all", "covariance", "distance")) { + stop("Invalid local method. Local method must be \"all\", \"covariance\", or \"distance\".", call. = FALSE) + } + } + + + if (!"method" %in% names_local) { + # local$method <- "all" + local$method <- "covariance" + } + + if (local$method %in% c("distance", "covariance") && !"size" %in% names_local) { + local$size <- 1000 + } + + # no parallel required + + local +} diff --git a/R/laploglik_products.R b/R/laploglik_products.R index ba23289..8400b6e 100644 --- a/R/laploglik_products.R +++ b/R/laploglik_products.R @@ -229,6 +229,7 @@ get_w_and_H_spglm <- function(data_object, dispersion, SigInv_list, SigInv_X, co # Next, compute H D <- get_D(family, w, y, size, dispersion) H <- D - Ptheta # not PD but -H is + # can consider changing tol here for numeric stability solveHg <- solve(H, g) wnew <- w - solveHg # mH_upchol <- chol(Matrix::forceSymmetric(-H)) @@ -342,6 +343,7 @@ get_w_and_H_spgautor <- function(data_object, dispersion, SigInv, SigInv_X, cov_ # Next, compute H D <- get_D(family, w, y, size, dispersion) H <- D - Ptheta # not PD but -H is + # can consider changing tol here for numeric stability solveHg <- solve(H, g) wnew <- w - solveHg # mH_upchol <- chol(Matrix::forceSymmetric(-H)) diff --git a/R/predict.R b/R/predict.R index 20ad711..d011a0c 100644 --- a/R/predict.R +++ b/R/predict.R @@ -69,10 +69,25 @@ #' If \code{local} is \code{TRUE}, defaults for \code{local} are chosen such #' that \code{local} is transformed into #' \code{list(size = 100, method = "covariance", parallel = FALSE)}. +#' +#' If \code{block} is \code{TRUE}, \code{local} accepts \code{method} and \code{size}, +#' and \code{method} takes values of \code{"all"}, \code{"covariance"}, +#' and \code{"distance"}, similar as when \code{block} is \code{FALSE}. +#' The default \code{method} is \code{"distance"} with size \code{1000}. This default \code{size} is +#' much larger than when \code{block} is \code{FALSE}. This is because when \code{block} is \code{TRUE}, +#' covariances and explanatory variables are averaged before prediction, which greatly +#' reduces computational burden, only requiring the Cholesky decomposition +#' of one observed covariance matrix. Because the computational burden is reduced dramatically when \code{block} is \code{TRUE}, +#' parallel processing is not needed and hence, \code{parallel} and \code{ncores} are ignored if specified in \code{local}. #' @param terms If \code{type} is \code{"terms"}, the type of terms to be returned, #' specified via either numeric position or name. The default is all terms are included. #' @param na.action Missing (\code{NA}) values in \code{newdata} will return an error and should #' be removed before proceeding. +#' @param block A logical indicating whether a block prediction over the entire region +#' in \code{newdata} should be returned. When \code{block} is \code{TRUE}, +#' \code{newdata} should be a dense grid of prediction locations that span +#' the entire region. The default is \code{FALSE}, which +#' returns point predictions for each location on \code{newdata}. #' @param ... Other arguments. Only used for models fit using \code{splmRF()} #' or \code{spautorRF()} where \code{...} indicates other #' arguments to \code{ranger::predict.ranger()}. @@ -122,7 +137,8 @@ #' predict(spmod, sulfate_preds, interval = "prediction") #' augment(spmod, newdata = sulfate_preds, interval = "prediction") predict.splm <- function(object, newdata, se.fit = FALSE, scale = NULL, df = Inf, interval = c("none", "confidence", "prediction"), - level = 0.95, type = c("response", "terms"), local, terms = NULL, na.action = na.fail, ...) { + level = 0.95, type = c("response", "terms"), block = FALSE, local, terms = NULL, na.action = na.fail, ...) { + # match interval argument so the three display interval <- match.arg(interval) @@ -133,6 +149,11 @@ predict.splm <- function(object, newdata, se.fit = FALSE, scale = NULL, df = Inf local <- NULL } + if (block) { + object <- predict_block_splm(object, newdata, se.fit, scale, df, interval, level, type, local, terms, na.action, ...) + return(object) + } + # check scale is numeric (if specified) if (!is.null(scale) && !is.numeric(scale)) { stop("scale must be numeric.", call. = FALSE) diff --git a/R/predict_block.R b/R/predict_block.R new file mode 100644 index 0000000..bf458f4 --- /dev/null +++ b/R/predict_block.R @@ -0,0 +1,236 @@ +predict_block_splm <- function(object, newdata, se.fit, scale, df, interval, level, type, local, terms, na.action, ...) { + + + + # deal with local + if (missing(local)) { + local <- NULL + } + + # check scale is numeric (if specified) + if (!is.null(scale) && !is.numeric(scale)) { + stop("scale must be numeric.", call. = FALSE) + } + + # handle na action -- this is an inefficient workaround that should be fixed later + # placeholder as a reminder to consider adding na.action argument at a later date + # na_action <- as.character(substitute(na.action)) + + # error if newdata missing from arguments and object + if (missing(newdata) && is.null(object$newdata)) { + stop("No missing data to predict. newdata must be specified in the newdata argument or object$newdata must be non-NULL.", call. = FALSE) + } + + # rename relevant quantities + obdata <- object$obdata + xcoord <- object$xcoord + ycoord <- object$ycoord + + # write newdata if predicting missing data + if (missing(newdata)) { + add_newdata_rows <- TRUE + newdata <- object$newdata + } else { + add_newdata_rows <- FALSE + } + + # deal with local + if (is.null(local)) { + if (object$n > 10000) { + # if (object$n > 5000 || NROW(newdata) > 5000) { + local <- TRUE + message("Because the sample size of the fitted model object exceeds 10,000, we are setting local = TRUE to perform computationally efficient approximations. To override this behavior and compute the exact solution, rerun predict() with local = FALSE. Be aware that setting local = FALSE may result in exceedingly long computational times.") + } else { + local <- FALSE + } + } + + # save spcov param vector + spcov_params_val <- coef(object, type = "spcov") + + # save randcov param vector + randcov_params_val <- coef(object, type = "randcov") + + + attr_sp <- attr(class(newdata), "package") + if (!is.null(attr_sp) && length(attr_sp) == 1 && attr_sp == "sp") { + stop("sf objects must be used instead of sp objects. To convert your sp object into an sf object, run sf::st_as_sf().", call. = FALSE) + } + + if (inherits(newdata, "sf")) { + newdata <- suppressWarnings(sf::st_centroid(newdata)) + + newdata <- sf_to_df(newdata) + names(newdata)[[which(names(newdata) == ".xcoord")]] <- as.character(xcoord) # only relevant if newdata is sf data is not + names(newdata)[[which(names(newdata) == ".ycoord")]] <- as.character(ycoord) # only relevant if newdata is sf data is not + } + + # add back in zero column to cover anisotropy (should make anisotropy only available 1-d) + if (object$dim_coords == 1) { + obdata[[ycoord]] <- 0 + newdata[[ycoord]] <- 0 + } + + if (object$anisotropy) { # could just do rotate != 0 || scale != 1 + obdata_aniscoords <- transform_anis(obdata, xcoord, ycoord, + rotate = spcov_params_val[["rotate"]], + scale = spcov_params_val[["scale"]] + ) + obdata[[xcoord]] <- obdata_aniscoords$xcoord_val + obdata[[ycoord]] <- obdata_aniscoords$ycoord_val + newdata_aniscoords <- transform_anis(newdata, xcoord, ycoord, + rotate = spcov_params_val[["rotate"]], + scale = spcov_params_val[["scale"]] + ) + newdata[[xcoord]] <- newdata_aniscoords$xcoord_val + newdata[[ycoord]] <- newdata_aniscoords$ycoord_val + } + + formula_newdata <- delete.response(terms(object)) + # fix model frame bug with degree 2 basic polynomial and one prediction row + # e.g. poly(x, y, degree = 2) and newdata has one row + if (any(grepl("nmatrix.", attributes(formula_newdata)$dataClasses, fixed = TRUE)) && NROW(newdata) == 1) { + newdata <- newdata[c(1, 1), , drop = FALSE] + newdata_model_frame <- model.frame(formula_newdata, newdata, drop.unused.levels = FALSE, na.action = na.pass, xlev = object$xlevels) + newdata_model <- model.matrix(formula_newdata, newdata_model_frame, contrasts = object$contrasts) + newdata_model <- newdata_model[1, , drop = FALSE] + # find offset + offset <- model.offset(newdata_model_frame) + if (!is.null(offset)) { + offset <- offset[1] + } + newdata <- newdata[1, , drop = FALSE] + } else { + newdata_model_frame <- model.frame(formula_newdata, newdata, drop.unused.levels = FALSE, na.action = na.pass, xlev = object$xlevels) + # assumes that predicted observations are not outside the factor levels + newdata_model <- model.matrix(formula_newdata, newdata_model_frame, contrasts = object$contrasts) + # find offset + offset <- model.offset(newdata_model_frame) + } + attr_assign <- attr(newdata_model, "assign") + attr_contrasts <- attr(newdata_model, "contrasts") + keep_cols <- which(colnames(newdata_model) %in% colnames(model.matrix(object))) + newdata_model <- newdata_model[, keep_cols, drop = FALSE] + attr(newdata_model, "assign") <- attr_assign[keep_cols] + attr(newdata_model, "contrasts") <- attr_contrasts + + # finding rows w/out NA + ob_predictors <- complete.cases(newdata_model) + if (any(!ob_predictors)) { + stop("Cannot have NA values in predictors.", call. = FALSE) + } + + newdata_model_attr <- attributes(newdata_model) + newdata_model <- matrix(colMeans(newdata_model), nrow = 1) # gives matrix, array class + attr(newdata_model, "assign") <- newdata_model_attr$assign + rownames(newdata_model) <- "1" + colnames(newdata_model) <- newdata_model_attr$dimnames[[2]] + x0 <- newdata_model + betahat <- coef(object) + cov_betahat <- vcov(object) + y <- model.response(model.frame(object)) + offset <- model.offset(model.frame(object)) + # call terms if needed + if (type == "terms") { + return(predict_terms(object, newdata_model, se.fit, scale, df, interval, level, add_newdata_rows, terms, ...)) + } + + + if (interval %in% c("none", "prediction")) { + + # local prediction list + local <- get_local_list_prediction_block(local) + + c0 <- colMeans(covmatrix(object, newdata = newdata, cov_type = "pred.obs")) + Sig <- covmatrix(object) + s0 <- mean(covmatrix(object, newdata = newdata, cov_type = "pred.pred")) + Xmat <- model.matrix(object) + + if (local$method == "all") { + cov_lowchol <- t(Matrix::chol(Matrix::forceSymmetric(Sig))) + } else { + n <- length(c0) + if (local$method == "distance") { + dist_vector <- spdist_vectors(newdata, obdata, xcoord, ycoord, object$dim_coords) + dist_vector <- colMeans(dist_vector) + index <- order(as.numeric(dist_vector))[seq(from = 1, to = min(n, local$size))] + # index <- order(as.numeric(c0))[seq(from = n, to = max(1, n - local$size + 1))] # covariance method for now + } else if (local$method == "covariance") { + index <- order(as.numeric(c0))[seq(from = n, to = max(1, n - local$size + 1))] + } + obdata <- obdata[index, , drop = FALSE] + c0 <- c0[index] + Xmat <- Xmat[index, , drop = FALSE] + y <- y[index] + if (!is.null(offset)) { + offset <- offset[index] + y <- y - offset + } + cov_lowchol <- t(Matrix::chol(Matrix::forceSymmetric(Sig[index, index, drop = FALSE]))) + } + + SqrtSigInv_X <- forwardsolve(cov_lowchol, Xmat) + SqrtSigInv_y <- forwardsolve(cov_lowchol, y) + residuals_pearson <- SqrtSigInv_y - SqrtSigInv_X %*% betahat + SqrtSigInv_c0 <- forwardsolve(cov_lowchol, c0) + + fit <- as.numeric(x0 %*% betahat + Matrix::crossprod(SqrtSigInv_c0, residuals_pearson)) + if (!is.null(offset)) { + fit <- fit + offset + } + + if (se.fit || interval == "prediction") { + H <- x0 - crossprod(SqrtSigInv_c0, SqrtSigInv_X) + vars <- as.numeric(s0 - crossprod(SqrtSigInv_c0, SqrtSigInv_c0) + H %*% tcrossprod(cov_betahat, H)) + se <- sqrt(vars) + if (!is.null(scale)) { + se <- se * scale + df <- df + } else { + df <- Inf + } + if (interval == "prediction") { + tstar <- qt(1 - (1 - level) / 2, df = df) + lwr <- fit - tstar * se + upr <- fit + tstar * se + fit <- cbind(fit, lwr, upr) + row.names(fit) <- "1" + } + if (se.fit) { + return(list(fit = fit, se.fit = se)) + } else { + return(fit) + } + } else { + return(fit) + } + } else if (interval == "confidence") { + # finding fitted values of the mean parameters + fit <- as.numeric(x0 %*% coef(object)) + # apply offset + if (!is.null(offset)) { + fit <- fit + offset + } + vars <- as.numeric(tcrossprod(x0 %*% cov_betahat, x0)) # different from + # predict because x0 is a matrix here, not a vector + se <- sqrt(vars) + if (!is.null(scale)) { + se <- se * scale + df <- df + } else { + df <- Inf + } + tstar <- qt(1 - (1 - level) / 2, df = df) + lwr <- fit - tstar * se + upr <- fit + tstar * se + fit <- cbind(fit, lwr, upr) + row.names(fit) <- "1" + if (se.fit) { + return(list(fit = fit, se.fit = se)) + } else { + return(fit) + } + } else { + stop("Interval must be none, confidence, or prediction") + } +} diff --git a/R/use_gloglik_iid.R b/R/use_gloglik_iid.R index fafa55b..4a5da32 100644 --- a/R/use_gloglik_iid.R +++ b/R/use_gloglik_iid.R @@ -22,12 +22,12 @@ use_gloglik_iid <- function(spcov_initial, estmethod, data_object, dist_matrix_l lmod <- lm(data_object$formula, data = data_object$obdata) sse <- sum(residuals(lmod)^2) - RX <- crossprod(X, X) + Xt_X <- crossprod(X, X) l1 <- 0 # sum of the logs of the identity (all ones) l2 <- sse # l3 <- 2 * sum(log(diag(abs(R)))) - l3 <- sum(log(diag(abs(RX)))) + l3 <- 2 * sum(log(diag(chol(Xt_X)))) if (estmethod == "reml") { minustwologlik <- as.numeric(l1 + (data_object$n - data_object$p) * log(l2) + l3 + (data_object$n - data_object$p) * (1 + log(2 * pi / (data_object$n - data_object$p)))) diff --git a/_pkgdown.yml b/_pkgdown.yml index 8881dd9..958f0a1 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -10,6 +10,7 @@ articles: - articles/guide - articles/SPGLMs - articles/emmeans + - articles/block - articles/technical # toc broken in bootstrap 5, pkgdown 2.1.0 diff --git a/data/fc_borders.rda b/data/fc_borders.rda new file mode 100644 index 0000000..75ea8f9 Binary files /dev/null and b/data/fc_borders.rda differ diff --git a/docs/404.html b/docs/404.html index 4bd27a9..a689570 100644 --- a/docs/404.html +++ b/docs/404.html @@ -39,7 +39,7 @@ @@ -67,6 +67,9 @@
vignettes/articles/block.Rmd
+ block.Rmdspmodel is a package used to fit, summarize, and predict
+for a variety of spatial statistical models. This vignette explores
+tools for predicting the average value of a response variable in a
+particular geographic region, an approach known as Block Prediction
+(i.e., Block Kriging) (Cressie 1993; Ver Hoef
+2008). Before proceeding, we load spmodel,
+sf, and ggplot2 by running
If using spmodel in a formal publication or report,
+please cite it. Citing spmodel lets us devote more
+resources to the package in the future. We view the spmodel
+citation by running
+citation(package = "spmodel")#> To cite spmodel in publications use:
+#>
+#> Dumelle M, Higham M, Ver Hoef JM (2023). spmodel: Spatial statistical
+#> modeling and prediction in R. PLOS ONE 18(3): e0282524.
+#> https://doi.org/10.1371/journal.pone.0282524
+#>
+#> A BibTeX entry for LaTeX users is
+#>
+#> @Article{,
+#> title = {{spmodel}: Spatial statistical modeling and prediction in {R}},
+#> author = {Michael Dumelle and Matt Higham and Jay M. {Ver Hoef}},
+#> journal = {PLOS ONE},
+#> year = {2023},
+#> volume = {18},
+#> number = {3},
+#> pages = {1--32},
+#> doi = {10.1371/journal.pone.0282524},
+#> url = {https://doi.org/10.1371/journal.pone.0282524},
+#> }
+spmodel
+The sulfate data is an sf object that
+contains sulfate measurements in the conterminous United States (CONUS).
+We first visualize the distribution of the sulfate data by running
+ggplot(sulfate, aes(color = sulfate)) +
+ geom_sf(size = 2.5) +
+ scale_color_viridis_c() +
+ theme_gray(base_size = 18)
In the “Detailed Guide” vignette, we describe how to fit a spatial
+linear model to the sulfate data and use it to predict sulfate
+concentrations at the geographic locations in
+sulfate_preds. But what if we want to predict the average
+sulfate concentration in the entire CONUS? Or what if we want to predict
+the average sulfate concentration in a single state like Colorado? Or a
+region like the Four Corners region of CONUS? We can answer these
+questions using Block Prediction (i.e., Block Kriging).
First, we fit a spatial linear model for sulfate using an +intercept-only model with an exponential spatial covariance function by +running
+
+sulfmod <- splm(sulfate ~ 1, sulfate, spcov_type = "exponential")We can use this model to perform Block Prediction in any subregion of
+CONUS. Consider the four corners region of CONUS, which is composed of
+four states: Arizona (AZ), Colorado (CO), New Mexico (NM), and Utah
+(UT). fc_borders is a simple features object with polygonal
+geometry that contains the boundaries of these four states. We can
+visualize the states in this region by running
+ggplot(data = fc_borders, aes(fill = STUSPS, label = STUSPS)) +
+ geom_sf() +
+ geom_sf_text()
We subset the sulfate data to the observations from the Four Corners
+region so that we can zoom in on their values. We implement a slight
+jitter (random perturbation) of the coordinates to avoid overplotting
+points that are very close to one another. Moreover, we use a different
+color scale than for the full sulfate data so we can better
+distinguish among the sulfate concentrations in the region:
+sulfate_fc <- sulfate[fc_borders, ]
+sulfate_fc <- st_jitter(sulfate_fc, factor = 0.04)
+ggplot() +
+ geom_sf(data = fc_borders, fill = "transparent") +
+ geom_sf(data = sulfate_fc, aes(color = sulfate), size = 2) +
+ scale_color_viridis_c(option = "B", limits = c(0, 8)) +
+ theme_bw()
The sulfate concentrations in the western part of the Four Corners +region (UT, AZ) appear lower than in the eastern part (CO, NM).
+Recall that our goal is to predict the mean sulfate concentration in
+the entire region. Block Prediction works (in practice) by first
+generating a dense grid of prediction locations that cover the entire
+geographic region of interest (here, the Four Corners with boundaries
+given in fc_borders). We create this grid and turn the
+dense grid (of 5,000 points) into an sf object by
+running:
+grid_size <- 5000
+fc_grid <- st_sample(fc_borders, grid_size, type = "hexagonal")
+fc_grid <- st_as_sf(fc_grid)Depending on the size of the spatial domain, consider increasing or +decreasing the grid size. The larger the grid size, the more accurate +the Block Prediction, though the returns diminish rapidly and add +additional computational burden.
+We visualize the spatial locations (small, black circles) on the grid +with the observed sulfate concentrations overlain:
+
+ggplot() +
+ geom_sf(data = fc_borders, fill = "transparent") +
+ geom_sf(data = fc_grid, size = 0.2) +
+ geom_sf(data = sulfate_fc, aes(color = sulfate), size = 4) +
+ scale_color_viridis_c(option = "B", limits = c(0, 8)) +
+ theme_bw()
Intuitively, Block Prediction works by making point predictions at
+each point in the dense grid and then combining each prediction in a way
+that respects the uncertainties associated with the fitted spatial
+model. We obtain predictions at each point in the dense grid using the
+sulfmod model that was fit to all of the sulfate data in
+the conterminous United States by running
+fc_grid$preds <- predict(sulfmod, newdata = fc_grid)We visualize these point predictions by running
+
+ggplot() +
+ geom_sf(data = fc_borders, fill = "transparent") +
+ geom_sf(data = fc_grid, aes(color = preds), size = 0.2) +
+ geom_sf(data = sulfate_fc, aes(color = sulfate), size = 4) +
+ scale_color_viridis_c(option = "B", limits = c(0, 8)) +
+ theme_bw()
We can combine the point predictions via Block Prediction by adding a
+block = TRUE argument to predict():
+predict(sulfmod, newdata = fc_grid, block = TRUE)#> [1] 3.487391
+Just like with predict.splm(), we can obtain appropriate
+prediction uncertainty using the interval argument:
+predict(sulfmod, newdata = fc_grid, block = TRUE, interval = "prediction")#> fit lwr upr
+#> 1 3.487391 2.062777 4.912005
+Now suppose that we wanted to predict average sulfate concentrations +in Colorado. First, we subset the Four Corners region to Colorado:
+
+co_borders <- fc_borders[fc_borders$STUSPS == "CO", ]Then, we subset the dense grid to Colordao:
+
+co_grid <- fc_grid[co_borders, ]Finally, we use Block Prediction:
+
+predict(sulfmod, co_grid, block = TRUE, interval = "prediction")#> fit lwr upr
+#> 1 3.954228 2.204428 5.704028
+Alternatively, one could create a new grid for Colordao rather than +subsetting the Four Corners grid.
+We write a helper function that returns the Block Prediction mean +sulfate concentration for each of the four states (by subsetting the +original grid):
+
+return_state_mean <- function(state) {
+ state_borders <- fc_borders[fc_borders$STUSPS == state, ]
+ state_grid <- fc_grid[state_borders, ]
+ predict(sulfmod, state_grid, block = TRUE)
+}
+fc_borders$mean <- vapply(fc_borders$STUSPS, return_state_mean, numeric(1))We can visualize each Block Prediction of mean sulfate +concentration:
+
+ggplot() +
+ geom_sf(data = fc_borders, aes(fill = mean)) +
+ scale_fill_viridis_c(option = "B", limits = c(0, 8)) +
+ theme_bw()
A few details:
+If the spatial model has explanatory variables, each element in +the dense Block Prediction grid must have values of those explanatory +variables. One way to accomplish this, for example, is to leverage a +raster whose layers contain relevant explanatory variables.
There is a nuanced difference between Block Prediction and the +fixed effect coefficients estimated by the model. Block Prediction +predicts the realized mean, while the fixed effect coefficients are +estimates of the underlying process mean. For more, see Dumelle et al. (2022).
Block Prediction is only available for point-referenced spatial
+linear models (i.e., models fit via splm()). Block
+Prediction is not available for areal data sets (e.g.,
+spgautor()) or spatial generalized linear models (e.g.,
+spglm(), spgautor()). Like
+predict.splm(), predict.splm(block = TRUE) has
+a local argument for large data sets. See
+?predict.splm() for more.
lake by state and year
by running
-ggplot(lake, aes(color = log_cond)) +
- geom_sf() +
- scale_color_viridis_c() +
- theme_gray(base_size = 14)ggplot(lake, aes(color = log_cond)) +
+ geom_sf() +
+ scale_color_viridis_c() +
+ theme_gray(base_size = 14)

Distribution of log conductivity in the lake data. @@ -621,7 +624,7 @@
#> state year temp.trend SE df asymp.LCL asymp.UCL
#> AZ 2012 0.145 0.0358 Inf 0.0748 0.215
#> CO 2012 0.269 0.0482 Inf 0.1747 0.364
-#> NV 2012 0.168 0.1030 Inf -0.0338 0.369
+#> NV 2012 0.168 0.1029 Inf -0.0338 0.369
#> UT 2012 0.325 0.0390 Inf 0.2486 0.401
#> AZ 2017 0.225 0.0656 Inf 0.0962 0.354
#> CO 2017 0.293 0.0704 Inf 0.1546 0.431
#> NV 2017 0.178 0.0562 Inf 0.0682 0.288
-#> UT 2017 0.362 0.2440 Inf -0.1159 0.840
+#> UT 2017 0.362 0.2438 Inf -0.1159 0.840
#>
#> Degrees-of-freedom method: asymptotic
#> Confidence level used: 0.95
diff --git a/docs/articles/guide.html b/docs/articles/guide.html
index b1264aa..11790f4 100644
--- a/docs/articles/guide.html
+++ b/docs/articles/guide.html
@@ -40,7 +40,7 @@
#> df AIC
#> spmod 3 373.2089
-#> lmod 1 636.0635
+#> lmod 1 633.6418
AICc(spmod, lmod)#> df AICc
#> spmod 3 373.2754
-#> lmod 1 636.0745
+#> lmod 1 633.6528
The noticeably lower AIC and AICc of the spatial model indicate that it is a better fit to the data than the non-spatial model. Recall that these AIC and AICc comparisons are valid because both models are fit @@ -704,7 +707,7 @@
#> model n p npar value AIC AICc BIC logLik deviance
#> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 spmod 365 2 3 367. 373. 373. 385. -184. 363.
-#> 2 lmod 365 2 1 634. 636. 636. 640. -317. 363.
+#> 2 lmod 365 2 1 632. 634. 634. 638. -316. 363.
#> # ℹ 1 more variable: pseudo.r.squared <dbl>
Finally, the augment() function augments the original
data with model diagnostics:
The much lower AIC and AICc for the spatial linear model indicates it
is a much better fit to the data. Outside of glance() and
diff --git a/docs/articles/technical.html b/docs/articles/technical.html
index 4e563da..5eb4914 100644
--- a/docs/articles/technical.html
+++ b/docs/articles/technical.html
@@ -40,7 +40,7 @@
@@ -68,6 +68,9 @@
Rather than making predictions at point-referenced locations, Block
+Prediction (i.e., Block Kriging) is a technique used to predict an
+average (or total) in some region (i.e., spatial domain). When
+interval = "none" or interval = "prediction",
+the (empirical) Block Prediction (BP) is given by \[\begin{equation}\label{eq:bp_pred}
+ \mathbf{\dot{y}}_B = \mathbf{X}_{B} \hat{\boldsymbol{\beta}} +
+\hat{\boldsymbol{\Sigma}}_{B} \hat{\boldsymbol{\Sigma}}^{-1}_{o}
+(\mathbf{y}_o - \mathbf{X}_o \hat{\boldsymbol{\beta}}).
+\end{equation}\] The quantity \(\mathbf{X}_{B} = [\mathbf{x}_{1, B},
+\mathbf{x}_{2, B}, ... , \mathbf{x}_{k, B}]^\top\) where \(j = 1, 2, ..., k\) indexes the columns of
+\(\mathbf{X}_{B}\) and \(\mathbf{x}_{j, B} = \frac{1}{|B|}\int_B
+\mathbf{x}_j d\mathbf{s}\) for the \(\mathbf{s}\) points in the region. The
+total area (volume) of the region is \(|B|\) . The quantity \(\hat{\boldsymbol{\Sigma}}_{B} =
+[\hat{\boldsymbol{\Sigma}}_1, \hat{\boldsymbol{\Sigma}}_2, ... ,
+\hat{\boldsymbol{\Sigma}}_n]^\top\) where \(i = 1, 2, ... , n\) indexes each element in
+\(\mathbf{y}_o\) and \(\hat{\boldsymbol{\Sigma}}_i = \frac{1}{|B|}\int_B
+\text{Cov}(\mathbf{y}_B, \text{y}_i) d\mathbf{s}\). The quantity
+\(\text{Cov}(\mathbf{y}_B,
+\text{y}_i)\) represents the covariance between \(\text{y}_i\) and all other points in the
+region. In practice, the Block Prediction integrals are approximated
+using summation on a fine grid of \(G\)
+points, similar to other numerical integration techniques. That is,
+\(\mathbf{x}_{j, B} \approx
+\frac{1}{|B|}\sum_{g = 1}^G \mathbf{x}_g\) and similarly for
+\(\hat{\boldsymbol{\Sigma}}_i\), where
+\(g\) indexes the points on the fine
+grid. Intuitively, these summations approximate average values in the
+entire region.
When interval = "prediction", the (100 \(\times\) level)% prediction
+interval for \(\mathbf{\dot{y}}_B\) is
+\(\mathbf{\dot{y}}_B \pm z^*
+\sqrt{\sigma^2_B}\), where \(\sigma^2_B
+= \sigma^{2*}_B - \hat{\boldsymbol{\Sigma}}_{B}
+\hat{\boldsymbol{\Sigma}}^{-1}_{o}\hat{\boldsymbol{\Sigma}}_{B}^\top +
+\mathbf{Q}_B (\mathbf{X}_o^\top \hat{\boldsymbol{\Sigma}}_o^{-1}
+\mathbf{X}_o)^{-1} \mathbf{Q}_B^\top\). The quantity \(\sigma^{2*}_B = \frac{1}{|B|^2}\int_B \int_B
+\text{Cov}(\text{y}_\mathbf{s}, \text{y}_\mathbf{u})
+d\mathbf{s}d\mathbf{u}\), where \(\mathbf{s}\) and \(\mathbf{u}\) represent points in the region
+(the product of \(\mathbf{s}\) and
+\(\mathbf{u}\) contains all possible
+pairs of points in the region). Intuitively, \(\sigma^{2*}_B\) approximates the average
+covariance between any two points in the region though summation over
+the fine grid. That is, \(\sigma^{2*}_B
+\approx \frac{1}{|B|^2}\sum_{g_i = 1}^G \sum_{g_j = 1}^G
+\text{Cov}(\text{y}_{g_i}, \text{y}_{g_j})\). The quantity \(\mathbf{Q}_B = \mathbf{X}_B -
+\hat{\boldsymbol{\Sigma}}_B \hat{\boldsymbol{\Sigma}}^{-1}_o
+\mathbf{X}_o\).
When interval = "confidence", the average process mean
+(i.e., not the realized mean) and uncertainties are returned from the
+underlying model. The (process) mean estimate is \(\mathbf{X}_{B} \hat{\boldsymbol{\beta}}\)
+and a (100 \(\times\)
+level)% confidence interval is \(\mathbf{X}_{B} \hat{\boldsymbol{\beta}} \pm z^*
+\sqrt{\mathbf{X}_{B} (\mathbf{X}^\top_o \hat{\boldsymbol{\Sigma}}_o^{-1}
+\mathbf{X}_o)^{-1} \mathbf{X}_{B}^\top}\).
For Big Data, when local = TRUE, the same approach is
+applied as for point prediction but adjusted slightly to accommodate the
+averaging necessary for Block Prediction. Thus, when
+method = "covariance" (the default), the size
+observations having the highest average covariance with elements of
+\(\mathbf{y}_o\) are used to find the
+subsets \(\check{\mathbf{X}}_o\), \(\check{\boldsymbol{\Sigma}}_o\), and \(\check{\mathbf{y}}_o\). When
+method = "distance", the size observations
+having the smallest average distance from elements of \(\mathbf{y}_o\) are used to find the subsets
+\(\check{\mathbf{X}}_o\), \(\check{\boldsymbol{\Sigma}}_o\), and \(\check{\mathbf{y}}_o\). Recall that these
+two methods are equivalent for processes without anisotropy, random
+effects, or partition factors, but can differ otherwise. The default is
+size = 1000, which is much larger than for point
+prediction. This is because for Block Prediction, the Cholesky
+decomposition of \(\check{\boldsymbol{\Sigma}}_o\) needs to
+only be computed once (rather than separately for each \(\check{\boldsymbol{\Sigma}}_o\) associated
+with each prediction location, as for point prediction).
Currently, the fine grid used to obtain Block Predictions is supplied
+by the user via newdata. For an overview of Block
+Prediction, see Cressie (1993). For
+applications to a finite population (i.e., a region with a finite number
+of point locations), see Ver Hoef (2008)
+and Dumelle et al. (2022).
splmRF() and spautorRF()
spmodel website titled “Block Prediction (i.e., Block Kriging) in spmodel”.fc_borders data set which contains borders for the Four Corners states in the United States.esv(); see the robustargument to esv() (#28)."none" and "ie" spatial covariance types (via spcov_type or spcov_initial) to spautor(), spgautor(), and spautorRF() (#27).State borders for the four corners states in the United States: +Arizona, Colorado, New Mexico, Utah.
+fc_bordersAn sf object with 4 rows and 4 columns:
NAME: State name.
STUSPS: State postal code.
GEO.ID: State GEO.ID from the United States Census Bureau.
geometry: POINT geometry representing coordinates in a NAD83
+projection (EPSG: 5070). Distances between points are in meters.
The data source is the United States Census Bureau TIGER/Line Shapefiles.
+esv() plot(<esv>)
Compute the empirical semivariogram
Four Corners State Borders
fitted(<splm>) fitted.values(<splm>) fitted(<spautor>) fitted.values(<spautor>) fitted(<spglm>) fitted.values(<spglm>) fitted(<spgautor>) fitted.values(<spgautor>)
spglm() or spgautor() model objects), or terms scale.
+A logical indicating whether a block prediction over the entire region
+in newdata should be returned. When block is TRUE,
+newdata should be a dense grid of prediction locations that span
+the entire region. The default is FALSE, which
+returns point predictions for each location on newdata.
A optional logical or list controlling the big data approximation. If omitted, local
is set to TRUE or FALSE based on the observed data sample size (i.e., sample size of the fitted
@@ -310,7 +322,16 @@
local is TRUE, defaults for local are chosen such
that local is transformed into
-list(size = 100, method = "covariance", parallel = FALSE).list(size = 100, method = "covariance", parallel = FALSE).
+If block is TRUE, local accepts method and size,
+and method takes values of "all", "covariance",
+and "distance", similar as when block is FALSE.
+The default method is "distance" with size 1000. This default size is
+much larger than when block is FALSE. This is because when block is TRUE,
+covariances and explanatory variables are averaged before prediction, which greatly
+reduces computational burden, only requiring the Cholesky decomposition
+of one observed covariance matrix. Because the computational burden is reduced dramatically when block is TRUE,
+parallel processing is not needed and hence, parallel and ncores are ignored if specified in local.