From 9e99946e893c70725562fd36384e989f66b0358a Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 30 May 2025 15:24:47 -0500 Subject: [PATCH] Fix some typos --- NEWS.md | 3 + R/objective_function_helpers.R | 31 ++++---- man/objective_function.Rd | 137 +++++++++++++++++---------------- 3 files changed, 91 insertions(+), 80 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1ae1ad6..d90cf8f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -35,6 +35,9 @@ be directly added to this file to describe the related changes. # UNRELEASED +- Fixed typos in the help page for `objective_function`, and in the `add_norm` + function (defined in `R/objective_function_helpers.R`) + # Changes in BioCroValidation Version 0.2.0 (2025-05-23) - Added 2002 and 2005 SoyFACE biomass and standard deviation data. diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R index ae5ef09..bfbae4d 100644 --- a/R/objective_function_helpers.R +++ b/R/objective_function_helpers.R @@ -200,23 +200,25 @@ add_norm <- function( qname_subset <- data_table[data_table[['quantity_name']] == qname, ] + npts <- nrow(qname_subset) + qmax <- max(abs(qname_subset[['quantity_value']])) + qobs <- data_table[j, 'quantity_value'] + + eps_max <- get_param_default(normalization_param, 1e-1) + eps_obs <- get_param_default(normalization_param, 1e-1) + if (tolower(normalization_method) == 'equal') { 1.0 } else if (tolower(normalization_method) == 'mean') { - nrow(qname_subset) * n_ddp + npts * n_ddp } else if (tolower(normalization_method) == 'max') { - max(qname_subset[['quantity_value']])^2 + qmax^2 + eps_max } else if (tolower(normalization_method) == 'obs') { - eps <- get_param_default(normalization_param, 1e-1) - data_table[i, 'quantity_value']^2 + eps + qobs^2 + eps_obs } else if (tolower(normalization_method) == 'mean_max') { - npts <- nrow(qname_subset) - qmax <- max(qname_subset[['quantity_value']]) - npts * n_ddp * qmax^2 + npts * n_ddp * (qmax^2 + eps_max) } else if (tolower(normalization_method) == 'mean_obs') { - npts <- nrow(qname_subset) - eps <- get_param_default(normalization_param, 1e-1) - npts * n_ddp * (data_table[i, 'quantity_value']^2 + eps) + npts * n_ddp * (qobs^2 + eps_obs) } else { stop('Unsupported normalization_method: ', normalization_method) } @@ -234,15 +236,16 @@ add_w_var <- function(long_form_data, stdev_weight_method, stdev_weight_param) { data_table <- long_form_data[[i]] data_stdev <- data_table[['quantity_stdev']] + eps_log <- get_param_default(stdev_weight_param, 1e-5) + eps_inv <- get_param_default(stdev_weight_param, 1e-1) + data_table[['w_var']] <- if (tolower(stdev_weight_method) == 'equal') { 1.0 } else if (tolower(stdev_weight_method) == 'logarithm') { - eps <- get_param_default(stdev_weight_param, 1e-5) - log(1 / (data_stdev + eps)) + log(1.0 / (data_stdev + eps_log)) } else if (tolower(stdev_weight_method) == 'inverse') { - eps <- get_param_default(stdev_weight_param, 1e-1) - 1 / (data_stdev^2 + eps) + 1.0 / (data_stdev^2 + eps_inv) } else { stop('Unsupported stdev_weight_method: ', stdev_weight_method) } diff --git a/man/objective_function.Rd b/man/objective_function.Rd index 2cb1e5a..269fbf9 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -24,7 +24,7 @@ parameterization procedure. For a detailed example of using \code{objective_function}, see the - "Parameterizing Soybean-BioCro" vignette/article. + "Parameterizing Soybean-BioCro" vignette. } \usage{ @@ -81,8 +81,8 @@ \item{independent_args}{ A list of named numeric values. The names will determine the independent arguments to be varied during their optimization, and the values specify - "initial" or "test" values of each argument that will be used internally to - check that the objective function is properly defined and can be evaluated. + "test" values of each argument that will be used internally to check that + the objective function is properly defined and can be evaluated. } \item{quantity_weights}{ @@ -165,16 +165,17 @@ \strong{Overview} When parameterizing a BioCro model, the general idea is to vary a subset of - the model's parameters to achieve the best agreement with a set of observed - data. The degree of agreement is expressed as an "error metric," which - includes terms derived from the agreement between the modeled and observed - values, as well as (optional) penalty terms. A function that calculates an - error metric when given a set of parameter values is called an "objective - function." Defining an objective function suitable for BioCro parameterization - can be complicated, but the \code{objective_function} function helps to - simplify the process of creating such a function. It is designed to - accommodate the following scenarios, which often occur in the context of - BioCro model parameterization: + the model's \code{parameters} and/or \code{initial_values} to achieve the + best agreement with a set of observed data. The degree of agreement is + expressed as an "error metric," which includes terms derived from the + agreement between the modeled and observed values, as well as (optional) + penalty terms. A function that calculates an error metric when given a set of + argument values (\code{parameters} and/or \code{initial_values}) is called an + "objective function." Defining an objective function suitable for BioCro + parameterization can be complicated, but the \code{objective_function} + function helps to simplify the process of creating such a function. It is + designed to accommodate the following scenarios, which often occur in the + context of BioCro model parameterization: \itemize{ \item \strong{Multi-year or multi-location data:} Often the model needs to @@ -267,7 +268,7 @@ by retrieving the value of the \eqn{Y_i} variable at the closest time to \eqn{t_i} that is included in the model output. It is assumed that the model always outputs the same sequence of time values each time it - is run with a particular set of drivers, regardless of the input + is run with a particular set of drivers, regardless of the provided argument values. The quantity-based weight factors \eqn{w_i^{quantity}} are directly @@ -314,7 +315,7 @@ The function \eqn{f_{user}} must accept a single data frame as an input and return a single numeric value as its output, but has no other requirements. It is specified via the - \code{extra_penalty_function} argument. When + \code{extra_penalty_function}. When \code{extra_penalty_function} is \code{NULL}, \eqn{P_{user}} is zero. \item \strong{Regularization penalty term:} The regularization penalty term @@ -350,14 +351,14 @@ ln \left( \frac{1}{\sigma_i + \epsilon} \right),} where \eqn{ln} denotes a logarithm with base \eqn{e} and \eqn{\epsilon} is a small number included to prevent numerical errors - that would otherwise occur when \eqn{\sigma = 0}. This method was + that would otherwise occur when \eqn{\sigma_i = 0}. This method was used in the original Soybean-BioCro paper. - The value of \eqn{\epsilon} is specified by the - \code{stdev_weight_param} input argument, which defaults to - \code{1e-5} when \code{stdev_weight_param} is \code{NULL} when using - this method. With the default value of \eqn{\epsilon}, - \eqn{w_i^{stdev} \approx 11.512} when \eqn{\sigma = 0}. + The value of \eqn{\epsilon} is specified by \code{stdev_weight_param}, + which defaults to \code{1e-5} if \code{stdev_weight_param} is + \code{NULL} when using this method. With the default value of + \eqn{\epsilon}, \eqn{w_i^{stdev} \approx 11.512} when + \eqn{\sigma = 0}. Note: this method should be used with caution, because \eqn{w_i^{stdev}} is zero for \eqn{\sigma_i = 1 - \epsilon}, and it @@ -369,11 +370,10 @@ where \eqn{\epsilon} is a small number included to prevent numerical errors that would otherwise occur when \eqn{\sigma_i = 0}. - The value of \eqn{\epsilon} is specified by the - \code{stdev_weight_param} input argument, which defaults to - \code{1e-1} when \code{stdev_weight_param} is \code{NULL} when using - this method. With the default value of \eqn{\epsilon}, - \eqn{w_i^{stdev} = 10} when \eqn{\sigma_i = 0}. + The value of \eqn{\epsilon} is specified by \code{stdev_weight_param}, + which defaults to \code{1e-1} if \code{stdev_weight_param} is + \code{NULL} when using this method. With the default value of + \eqn{\epsilon}, \eqn{w_i^{stdev} = 10} when \eqn{\sigma_i = 0}. } If any values of \eqn{w_i^{stdev}} are undefined, negative, or infinite, an @@ -404,13 +404,21 @@ \item \code{'max'}: For this method, when \eqn{Y_i} is named \code{vtype} and the observation is from a set called \code{vdata}, then - \deqn{N_i = \left( max_{vtype}^{vdata} \right)^2,} - where \eqn{max_{vtype}^{vdata}} is the maximum observed value of - \code{vtype} across \code{vdata}. In this case, the observed and + \deqn{N_i = \left( max_{vtype}^{vdata} \right)^2 + \epsilon,} + where \eqn{max_{vtype}^{vdata}} is the maximum observed absolute value + of \code{vtype} across \code{vdata} and \eqn{\epsilon} is a small + number included to prevent numerical errors that would otherwise occur + when \eqn{max_{vtype}^{vdata} = 0}. In this case, the observed and modeled values that appear in the equation for \eqn{E_{agreement}} are - essentially normalized by their maximum value, hence the name for this - method. This approach avoids over-representing variable types with - larger magnitude when determining \eqn{E_{agreement}}. + essentially normalized by their maximum magnitude, hence the name for + this method. This approach avoids over-representing variable types + with larger magnitude when determining \eqn{E_{agreement}}. + + The value of \eqn{\epsilon} is specified by + \code{normalization_param}, which defaults to \code{1e-1} if + \code{normalization_param} is \code{NULL} when using this method. + With the default value of \eqn{\epsilon}, \eqn{N_i = 10} when + \eqn{max_{vtype}^{vdata} = 0}. \item \code{'obs'}: For this method, @@ -423,35 +431,34 @@ avoids over-representing time points where a particular quantity takes its largest values when determining \eqn{E_{agreement}}. - The value of \eqn{\epsilon} is specified by the - \code{normalization_param} input argument, which defaults to - \code{1e-1} when \code{normalization_param} is \code{NULL} when using - this method. With the default value of \eqn{\epsilon}, \eqn{N_i = 10} - when \eqn{Y_{obs}^i = 0}. + The value of \eqn{\epsilon} is specified by + \code{normalization_param}, which defaults to \code{1e-1} if + \code{normalization_param} is \code{NULL} when using this method. + With the default value of \eqn{\epsilon}, \eqn{N_i = 10} when + \eqn{Y_{obs}^i = 0}. \item \code{'mean_max'}: For this method, the "mean" and "max" methods are combined so that - \deqn{N_i = n_{vtype}^{vdrivers}% - \cdot n_{data} \cdot \left( max_{vtype}^{vdata} \right)^2.} + \deqn{N_i = n_{vtype}^{vdrivers} \cdot n_{data}% + \cdot \left[ \left( max_{vtype}^{vdata} \right)^2 + \epsilon \right].} This approach avoids over-representing drivers with larger numbers of associated observations, and variable types with larger magnitudes. This method is used for parameterizing Soybean-BioCro. + The value \eqn{\epsilon} is specified by \code{normalization_param}, + using the same default value as in the \code{'max'} method. + \item \code{'mean_obs'}: For this method, the "mean" and "obs" methods are combined so that \deqn{N_i = n_{vtype}^{vdrivers} \cdot n_{data}% - \cdot \left( \left( Y_{obs}^i \right)^2 + \epsilon \right)^2.} + \cdot \left[ \left( Y_{obs}^i \right)^2 + \epsilon \right].} This approach avoids over-representing drivers with larger numbers of associated observations, and time points with larger observed values. - The value of \eqn{\epsilon} is specified by the - \code{normalization_param} input argument, which defaults to - \code{1e-1} when \code{normalization_param} is \code{NULL} when using - this method. With the default value of \eqn{\epsilon}, - \eqn{N_i = 10 \cdot n_{vtype}^{vdrivers} \cdot n_{data}} when - \eqn{Y_{obs}^i = 0}. + The value \eqn{\epsilon} is specified by \code{normalization_param}, + using the same default value as in the \code{'obs'} method. } In most situations, it is recommended to use either \code{'mean_max'} or @@ -505,14 +512,14 @@ output, when accounting for the \code{data_definitions} and \code{post_process_function}. - \item Ensuring that each independent and dependent argument name is either - a parameter or initial value of the model. Internally, argument names - are passed to \code{\link[BioCro]{partial_run_biocro}} via its - \code{arg_names} input. Note that argument names passed to - \code{partial_run_biocro} can technically include drivers, but it is - unlikely that the value of a driver would be varied during an - optimization, so the argument names are not allowed to include - columns in the drivers. + \item Ensuring that each independent and dependent argument name is a member + of the model's \code{parameters} or \code{initial_values}. Internally, + argument names are passed to \code{\link[BioCro]{partial_run_biocro}} + via its \code{arg_names} input. Note that argument names passed to + \code{partial_run_biocro} can technically include members of the + \code{drivers}, but it is unlikely that the value of a driver would be + varied during an optimization, so the argument names are not allowed + to include columns in the \code{drivers}. \item Ensuring that the optional \code{dependent_arg_function}, \code{post_process_function}, and \code{extra_penalty_function} @@ -544,22 +551,20 @@ the same order as in \code{independent_arg_names}), and \code{lambda} is the value of the regularization parameter. - The \code{return_terms} argument determines the return value of - \code{obj_fun}. When \code{return_terms} is \code{FALSE}, \code{obj_fun} - returns values of the error metric \eqn{E}. When \code{return_terms} is - \code{TRUE}, \code{obj_fun} returns a list including each individual term of - the total error metric. + When \code{return_terms} is \code{FALSE}, \code{obj_fun} returns values of the + error metric \eqn{E}. When \code{return_terms} is \code{TRUE}, \code{obj_fun} + returns a list including each individual term of the total error metric. During optimization, \code{return_terms} should always be \code{FALSE}. Setting it to \code{TRUE} can be useful for troubleshooting, or for diagnostics such as checking the quality of fit for each of the data-driver pairs. - The \code{debug_mode} argument determines whether \code{obj_fun} is running in - debug mode. In debug mode, each time \code{obj_fun} is called, it will print - the values of \code{x} and the error metric to the R terminal. This can be - useful when troubleshooting a problem with an optimization, since it provides - a record of any problematic parameter combinations. When setting + When \code{debug_mode} is \code{TRUE}, \code{obj_fun} operates in "debug + mode." In debug mode, each time \code{obj_fun} is called, it will print the + values of \code{x} and the error metric to the R terminal. This can be useful + when troubleshooting a problem with an optimization, since it provides a + record of any problematic parameter combinations. When setting \code{debug_mode} to \code{TRUE}, also consider using \code{\link[base]{sink}} to write the outputs to a file instead of the R terminal. In that case, there will still be a record even if R crashes. @@ -690,12 +695,12 @@ if (require(BioCro)) { # objective function increases, indicating a lower degree of agreement between # the model and the observed data. Here we will call `obj_fun` in debug mode, # which will automatically print the value of the error metric. - cat('\nError metric calculated by doubling the initial argument values:\n') + cat('\nError metric calculated by doubling the original argument values:\n') error_metric <- obj_fun(2 * as.numeric(independent_args), debug_mode = TRUE) # We can also see the values of each term that makes up the error metric; # again, we will call `obj_fun` in debug mode for automatic printing. - cat('\nError metric terms calculated by doubling the initial argument values:\n') + cat('\nError metric terms calculated by doubling the original argument values:\n') error_terms <- obj_fun(2 * as.numeric(independent_args), return_terms = TRUE, debug_mode = TRUE) }