From 382678f81e0cbce7528617773ff5d4934e8c3976 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 23 May 2025 09:59:31 -0500 Subject: [PATCH 1/3] Add more info about `bounds_table` --- DESCRIPTION | 1 + man/bounds_table.Rd | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/DESCRIPTION b/DESCRIPTION index 720c39c..78e5225 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -14,6 +14,7 @@ Imports: BioCro (>= 3.2.0) Suggests: dfoptim, + DEoptim, lattice, knitr, rmarkdown, diff --git a/man/bounds_table.Rd b/man/bounds_table.Rd index b833c59..45cd68a 100644 --- a/man/bounds_table.Rd +++ b/man/bounds_table.Rd @@ -56,6 +56,10 @@ bounds; an error will be thrown if any do lie outside the bounds. A warning will also be thrown if any initial guesses lie on the bounds, since this can be problematic for some optimizers, such as \code{\link[dfoptim]{nmkb}}. + + Some optimizers, such as \code{\link[DEoptim]{DEoptim}}, do not require an + initial guess; in this case, there is no strong need to pass an initial guess + to \code{bounds_table}. } \value{ From 43a0e74baaf2a2dc2322842517554b94ac735c7e Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 23 May 2025 09:59:57 -0500 Subject: [PATCH 2/3] Allow flexible `epsilon` --- R/objective_function.R | 6 +- R/objective_function_helpers.R | 32 +++++- man/objective_function.Rd | 114 ++++++++++++++++---- tests/testthat/test-objective_function.R | 1 + vignettes/parameterizing_soybean_biocro.Rmd | 28 +++-- 5 files changed, 139 insertions(+), 42 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index 1730e29..6c21e00 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -8,7 +8,9 @@ objective_function <- function( quantity_weights, data_definitions = list(), normalization_method = 'mean_max', + normalization_param = NULL, stdev_weight_method = 'equal', + stdev_weight_param = NULL, regularization_method = 'none', dependent_arg_function = NULL, post_process_function = NULL, @@ -73,13 +75,15 @@ objective_function <- function( long_form_data <- add_norm( long_form_data, normalization_method, + normalization_param, length(data_driver_pairs) ) # Add variance-based weights long_form_data <- add_w_var( long_form_data, - stdev_weight_method + stdev_weight_method, + stdev_weight_param ) # Print the long form data, if desired. Do this before checking the data, diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R index ce32bef..ae5ef09 100644 --- a/R/objective_function_helpers.R +++ b/R/objective_function_helpers.R @@ -174,8 +174,23 @@ add_time_indices <- function(initial_runner_res, long_form_data) { long_form_data } +# Helping function for getting a parameter value that has a default +get_param_default <- function(param, default) { + if (is.null(param)) { + default + } else { + param + } +} + # Helping function for getting normalization factors -add_norm <- function(long_form_data, normalization_method, n_ddp) { +add_norm <- function( + long_form_data, + normalization_method, + normalization_param, + n_ddp +) +{ for (i in seq_along(long_form_data)) { data_table <- long_form_data[[i]] @@ -191,10 +206,17 @@ add_norm <- function(long_form_data, normalization_method, n_ddp) { nrow(qname_subset) * n_ddp } else if (tolower(normalization_method) == 'max') { max(qname_subset[['quantity_value']])^2 + } else if (tolower(normalization_method) == 'obs') { + eps <- get_param_default(normalization_param, 1e-1) + data_table[i, 'quantity_value']^2 + eps } else if (tolower(normalization_method) == 'mean_max') { npts <- nrow(qname_subset) qmax <- max(qname_subset[['quantity_value']]) npts * n_ddp * qmax^2 + } else if (tolower(normalization_method) == 'mean_obs') { + npts <- nrow(qname_subset) + eps <- get_param_default(normalization_param, 1e-1) + npts * n_ddp * (data_table[i, 'quantity_value']^2 + eps) } else { stop('Unsupported normalization_method: ', normalization_method) } @@ -207,7 +229,7 @@ add_norm <- function(long_form_data, normalization_method, n_ddp) { } # Helping function for getting variance-based weights -add_w_var <- function(long_form_data, stdev_weight_method) { +add_w_var <- function(long_form_data, stdev_weight_method, stdev_weight_param) { for (i in seq_along(long_form_data)) { data_table <- long_form_data[[i]] data_stdev <- data_table[['quantity_stdev']] @@ -216,9 +238,11 @@ add_w_var <- function(long_form_data, stdev_weight_method) { if (tolower(stdev_weight_method) == 'equal') { 1.0 } else if (tolower(stdev_weight_method) == 'logarithm') { - log(1 / (data_stdev + 1e-5)) + eps <- get_param_default(stdev_weight_param, 1e-5) + log(1 / (data_stdev + eps)) } else if (tolower(stdev_weight_method) == 'inverse') { - 1 / data_stdev^2 + eps <- get_param_default(stdev_weight_param, 1e-1) + 1 / (data_stdev^2 + eps) } else { stop('Unsupported stdev_weight_method: ', stdev_weight_method) } diff --git a/man/objective_function.Rd b/man/objective_function.Rd index 4693d9c..2cb1e5a 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -22,6 +22,9 @@ It is also possible to include "dependent" model arguments, whose values are determined from the "independent" model arguments that are varied during the parameterization procedure. + + For a detailed example of using \code{objective_function}, see the + "Parameterizing Soybean-BioCro" vignette/article. } \usage{ @@ -32,7 +35,9 @@ quantity_weights, data_definitions = list(), normalization_method = 'mean_max', + normalization_param = NULL, stdev_weight_method = 'equal', + stdev_weight_param = NULL, regularization_method = 'none', dependent_arg_function = NULL, post_process_function = NULL, @@ -57,7 +62,8 @@ \code{time}, whose values follow BioCro's definition of \code{\link[BioCro]{time}}; the other columns should represent observed values of model outputs. Any \code{NA} values in \code{data} will - be ignored when calculating the error metric. + be ignored when calculating the error metric, but all non-\code{NA} values + of all columns (except \code{time}) will be compared to the model output. The \code{drivers} element must be a data frame that can be passed to \code{\link[BioCro]{run_biocro}} as its \code{drivers} input argument. @@ -75,8 +81,8 @@ \item{independent_args}{ A list of named numeric values. The names will determine the independent arguments to be varied during their optimization, and the values specify - "test" values of each argument that will be used internally to check that - the objective function is properly defined and can be evaluated. + "initial" or "test" values of each argument that will be used internally to + check that the objective function is properly defined and can be evaluated. } \item{quantity_weights}{ @@ -101,11 +107,25 @@ error metric; see below for more details. } + \item{normalization_param}{ + An (optional) parameter value used by some normalization methods. When + \code{normalization_param} is \code{NULL}, a default value will be used, + which depends on the particular normalization method. Otherwise, the + user-specified value will be used. See details below. + } + \item{stdev_weight_method}{ A string indicating the method to be used when calculating the variance-based weights used in the error metric; see below for more details. } + \item{stdev_weight_param}{ + An (optional) parameter value used by some normalization methods. When + \code{stdev_weight_param} is \code{NULL}, a default value will be used, + which depends on the particular normalization method. Otherwise, the + user-specified value will be used. See details below. + } + \item{regularization_method}{ A string indicating the regularization method to be used when calculating the regularization penalty term; see below for more details. @@ -327,18 +347,33 @@ as \deqn{w_i^{stdev} =% - ln \left( \frac{1}{\sigma + 10^{-5}} \right),} - where \eqn{ln} denotes a logarithm with base \eqn{e}. This method - was used in the original Soybean-BioCro paper. Note: this method - should be used with caution, because \eqn{w_i^{stdev}} is zero - for \eqn{\sigma = 1 - 10^{-5} = 0.99999}, and it becomes negative for - larger values of \eqn{\sigma}. + ln \left( \frac{1}{\sigma_i + \epsilon} \right),} + where \eqn{ln} denotes a logarithm with base \eqn{e} and + \eqn{\epsilon} is a small number included to prevent numerical errors + that would otherwise occur when \eqn{\sigma = 0}. This method was + used in the original Soybean-BioCro paper. + + The value of \eqn{\epsilon} is specified by the + \code{stdev_weight_param} input argument, which defaults to + \code{1e-5} when \code{stdev_weight_param} is \code{NULL} when using + this method. With the default value of \eqn{\epsilon}, + \eqn{w_i^{stdev} \approx 11.512} when \eqn{\sigma = 0}. + + Note: this method should be used with caution, because + \eqn{w_i^{stdev}} is zero for \eqn{\sigma_i = 1 - \epsilon}, and it + becomes negative for larger values of \eqn{\sigma_i}. \item \code{'inverse'}: For this method, \eqn{w_i^{stdev}} is calculated as - \deqn{w_i^{stdev} = \frac{1}{\sigma^2}.} - Note: this method should be used with caution, because - \eqn{w_i^{stdev}} is infinite when \eqn{\sigma} is zero. + \deqn{w_i^{stdev} = \frac{1}{\sigma_i^2 + \epsilon},} + where \eqn{\epsilon} is a small number included to prevent numerical + errors that would otherwise occur when \eqn{\sigma_i = 0}. + + The value of \eqn{\epsilon} is specified by the + \code{stdev_weight_param} input argument, which defaults to + \code{1e-1} when \code{stdev_weight_param} is \code{NULL} when using + this method. With the default value of \eqn{\epsilon}, + \eqn{w_i^{stdev} = 10} when \eqn{\sigma_i = 0}. } If any values of \eqn{w_i^{stdev}} are undefined, negative, or infinite, an @@ -377,6 +412,23 @@ method. This approach avoids over-representing variable types with larger magnitude when determining \eqn{E_{agreement}}. + \item \code{'obs'}: For this method, + + \deqn{N_i = \left( Y_{obs}^i \right)^2 + \epsilon,} + where \eqn{\epsilon} is a small number included to prevent numerical + errors that would otherwise occur when \eqn{Y_{obs}^i = 0}. In this + case, the equation for \eqn{E_{agreement}} essentially uses relative + differences rather than absolute differences, which is achieved by + normalizing by the observed values, hence the name. This approach + avoids over-representing time points where a particular quantity takes + its largest values when determining \eqn{E_{agreement}}. + + The value of \eqn{\epsilon} is specified by the + \code{normalization_param} input argument, which defaults to + \code{1e-1} when \code{normalization_param} is \code{NULL} when using + this method. With the default value of \eqn{\epsilon}, \eqn{N_i = 10} + when \eqn{Y_{obs}^i = 0}. + \item \code{'mean_max'}: For this method, the "mean" and "max" methods are combined so that @@ -384,10 +436,27 @@ \cdot n_{data} \cdot \left( max_{vtype}^{vdata} \right)^2.} This approach avoids over-representing drivers with larger numbers of associated observations, and variable types with larger magnitudes. - This method is used for parameterizing Soybean-BioCro, and is - recommended for most situations. + This method is used for parameterizing Soybean-BioCro. + + \item \code{'mean_obs'}: For this method, the "mean" and "obs" methods are + combined so that + + \deqn{N_i = n_{vtype}^{vdrivers} \cdot n_{data}% + \cdot \left( \left( Y_{obs}^i \right)^2 + \epsilon \right)^2.} + This approach avoids over-representing drivers with larger numbers of + associated observations, and time points with larger observed values. + + The value of \eqn{\epsilon} is specified by the + \code{normalization_param} input argument, which defaults to + \code{1e-1} when \code{normalization_param} is \code{NULL} when using + this method. With the default value of \eqn{\epsilon}, + \eqn{N_i = 10 \cdot n_{vtype}^{vdrivers} \cdot n_{data}} when + \eqn{Y_{obs}^i = 0}. } + In most situations, it is recommended to use either \code{'mean_max'} or + \code{'mean_obs'} depending on user preference or performance. + \strong{Regularization methods} The following regularization methods are available: @@ -437,11 +506,12 @@ \code{post_process_function}. \item Ensuring that each independent and dependent argument name is either - a parameter or initial value of the model. This is accomplished using - \code{\link[BioCro]{partial_run_biocro}}. Note that argument names - passed to \code{partial_run_biocro} can technically include drivers, - but it is unlikely that the value of a driver would be varied during - an optimization, so the argument names are not allowed to include + a parameter or initial value of the model. Internally, argument names + are passed to \code{\link[BioCro]{partial_run_biocro}} via its + \code{arg_names} input. Note that argument names passed to + \code{partial_run_biocro} can technically include drivers, but it is + unlikely that the value of a driver would be varied during an + optimization, so the argument names are not allowed to include columns in the drivers. \item Ensuring that the optional \code{dependent_arg_function}, @@ -449,9 +519,9 @@ functions can be run without causing errors. \item Ensuring that certain values are finite (such as \eqn{Y_{obs}}, - \eqn{\sigma}, \eqn{w_i^{stdev}}, and \eqn{N}), and that certain values - are not negative (such as \eqn{\sigma}, \eqn{w_i^{stdev}}, and - \eqn{N}). + \eqn{\sigma_i}, \eqn{w_i^{stdev}}, and \eqn{N_i}), and that certain + values are not negative (such as \eqn{\sigma_i}, \eqn{w_i^{stdev}}, + and \eqn{N_i}). } If any issues are detected, an informative error message will be sent. Note diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index 6fee66a..dcfccf1 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -473,6 +473,7 @@ test_that('Bad data values and weights are detected', { independent_args, quantity_weights, stdev_weight_method = 'inverse', + stdev_weight_param = 0, data_definitions = data_definitions, post_process_function = post_process_function, verbose_startup = verbose_startup diff --git a/vignettes/parameterizing_soybean_biocro.Rmd b/vignettes/parameterizing_soybean_biocro.Rmd index 3db1dea..651894a 100644 --- a/vignettes/parameterizing_soybean_biocro.Rmd +++ b/vignettes/parameterizing_soybean_biocro.Rmd @@ -100,10 +100,10 @@ added to the observed values. In previous parameterizations, a standard deviation for the root mass was not explicitly estimated; instead, the standard-deviation-based weight factor was -implicitly set to 1. Because the `'logarithm'` method was used, a weight factor -of 1 implies a standard deviation of \(1 / e - 10^{-5} \approx 0.3678694\). See -the documentation page (`?objective_function`) for more information about this -weighting method. +implicitly set to 1. Because the `'logarithm'` method with +\(\epsilon = 10^{-5}\) was used, a weight factor of 1 implies a standard +deviation of \(1 / e - 10^{-5} \approx 0.3678694\). See the documentation page +(`?objective_function`) for more information about this weighting method. Finally, the data set includes some values that are not needed for the parameterization. This includes the leaf litter accumulated between each @@ -129,14 +129,12 @@ process_table <- function(data_table, type) { sqrt(data_table$Rep_Mg_per_ha^2 + data_table$Seed_Mg_per_ha^2) } - # Define new `Root_Mg_per_ha` column - if (type == 'biomass') { - # Initialize all values to NA - data_table$Root_Mg_per_ha <- NA + # Define new `Root_Mg_per_ha` column, which has just one non-NA value + row_to_use <- 5 # Choose row to use + data_table$Root_Mg_per_ha <- NA # Initialize all values to NA + if (type == 'biomass') { # Estimate a mass at one time point - row_to_use <- 5 - col_to_add <- c( 'Leaf_Mg_per_ha', 'Stem_Mg_per_ha', @@ -146,9 +144,8 @@ process_table <- function(data_table, type) { data_table[row_to_use, 'Root_Mg_per_ha'] <- 0.17 * sum(data_table[row_to_use, col_to_add]) } else { - # We can set a value for each time point; any time points where the root - # mass is NA will be ignored - data_table$Root_Mg_per_ha <- 1 / exp(1) - 1e-5 + # Estimate standard deviation at one time point + data_table[row_to_use, 'Root_Mg_per_ha'] <- 1 / exp(1) - 1e-5 } # Remove columns by setting them to NULL @@ -397,8 +394,8 @@ more details to discuss: for more details. - Soybean-BioCro has always used the `'logarithm'` method for determining - weights from standard deviations; see Equation 17 of - @matthews_soybean_biocro_2021 for more details. + weights from standard deviations with \(\epsilon = 10^{-5}\); see Equation 17 + of @matthews_soybean_biocro_2021 for more details. - Soybean-BioCro has not used any regularization. @@ -416,6 +413,7 @@ obj_fun <- objective_function( data_definitions = data_definitions, normalization_method = 'mean_max', stdev_weight_method = 'logarithm', + stdev_weight_param = 1e-5, regularization_method = 'none', dependent_arg_function = dependent_arg_function, post_process_function = post_process_function, From 4b3e3490cc9fa3aa4672a2824272dcaed613d951 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 23 May 2025 15:46:09 -0500 Subject: [PATCH 3/3] Add "Getting Started" vignette --- NEWS.md | 7 ++- README.md | 9 ++++ vignettes/BioCroValidation.Rmd | 92 ++++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 vignettes/BioCroValidation.Rmd diff --git a/NEWS.md b/NEWS.md index ab5e3fa..350c164 100644 --- a/NEWS.md +++ b/NEWS.md @@ -38,14 +38,17 @@ be directly added to this file to describe the related changes. - Added 2002 and 2005 SoyFACE biomass data. - Added several new functions: `objective_function`, `update_model`, and - `bounds_table` + `bounds_table`. -- Added a vignette illustrating how to perform a model parameterization. +- Added two new vignettes: a "Getting Started" article (`BioCroValidation.Rmd`) + and a user guide illustrating how to perform a model parameterization + (`parameterizing_soybean_biocro.Rmd`). # Changes in BioCroValidation Version 0.1.0 - This is the first version of BioCroValidation. At this point, the package is in a state of rapid development, and not all changes will be described here. + - We are reserving version `1.0.0` for a more stable and complete future release; until then, major changes should only increase the minor version number. diff --git a/README.md b/README.md index 2b2c91c..f6eeee0 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,15 @@ remotes::install_github('BioCro/BioCroValidation') Note that this method requires the `remotes` package, which can be installed from within R by typing `install.packages('remotes')`. +### Usage + +The best way to learn about using `BioCroValidation` is to visit the +[BioCroValidation website](https://biocro.github.io/BioCroValidation/index.html) +and click the "Get Started" link in the top menu bar. The website includes +documentation for all the functions and data sets included in the package, as +well as articles that describe its general features and several important use +cases. + ### License The `BioCroValidation` R package and its documentation are licensed under the diff --git a/vignettes/BioCroValidation.Rmd b/vignettes/BioCroValidation.Rmd new file mode 100644 index 0000000..4982053 --- /dev/null +++ b/vignettes/BioCroValidation.Rmd @@ -0,0 +1,92 @@ +--- +title: "Getting Started With BioCroValidation" +output: + rmarkdown::html_vignette: + toc: true + number_sections: true +vignette: > + %\VignetteIndexEntry{Getting Started With BioCroValidation} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.width = 6, + fig.height = 5, + fig.align = "center" +) +``` + +# Overview + +**BioCroValidation** is an R package that provides a suite of tools for +validating BioCro crop growth models. + +Validation is a key part of using and developing BioCro models. The goal of this +package is to provide some convenient "helping" functions to aid with various +aspects of model validation, especially parameterization. + +The central tool in the package is the `objective_function` function. As its +name implies, it can be used to create an objective function that calculates the +value of an error metric value given a set of model parameter values. In turn, +the objective function itself can be passed to an optimizer. + +While it is possible for each BioCro user to write their own customized +objective function, creating one can be a very complex process because there are +many potential aspects to consider: + +- **Mathematical / Statistical Approach:** This refers to choices like "Do I + want to normalize the error terms by each observed value, or by the largest + observed value for each measured quantity?" or "Do I want to use L1 or L2 + regularization?" + +- **Implementation of Mathematical / Statistical Approach:** Once an approach + has been identified, code must be written to properly implement it. + +- **Error Checks:** A wide variety of strange conditions can occur during + parameterization, and the objective function must be ready to handle them. For + example, how should the objective function respond when a simulation does not + run to completion, or when an optimizer passes `NA` as a parameter value? + +- **Technical Details:** Parameterization can take a long time to perform, so it + is important for the objective function code to be as efficient and fast as + possible. + +The goal of `objective_function` is to allow users to make the key choices about +their mathematical approach using clear code statements like +`regularization_method = 'L2'`, while the implementation details, error checks, +and other technical details are handled internally. This will result in clear +scripts that are also more reliable. + +Besides `objective_function`, the package also includes a few other functions +with a similar goal of clarifying code and hiding implementation details, such +as `bounds_table` and `update_model`. + +# Installing BioCroValidation + +The easiest way to install `BioCroValidation` is to type the following from +within an R terminal: + +```{r, eval = FALSE} +remotes::install_github('biocro/BioCroValidation') +``` + +Note that this method requires the `remotes` package, which can be installed +from within R by typing `install.packages('remotes')`. + +# Learning More + +The `BioCroValidation` package includes extensive documentation. The best place +to start is the +[Parameterizing Soybean-BioCro](parameterizing_soybean_biocro.html) +article, which illustrates the full process of defining an objective function, +running an optimization, examining the results, and saving the new model +definition. + +Another key resource is the help page for `objective_function`, which can be +accessed online or by typing `?objective_function` in an R terminal. This +document explains all the available options for normalization approaches, +regularization approaches, and other aspects of defining an objective function.