From 03e156906f99f7da2f3abc25458e543299f570db Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 2 May 2025 14:14:54 -0500 Subject: [PATCH 01/54] Add SoyFACE biomass data --- NEWS.md | 4 +++ data/soyface_biomass.R | 58 ++++++++++++++++++++++++++++++++++++++++++ man/soyface_biomass.Rd | 48 ++++++++++++++++++++++++++++++++++ 3 files changed, 110 insertions(+) create mode 100644 data/soyface_biomass.R create mode 100644 man/soyface_biomass.Rd diff --git a/NEWS.md b/NEWS.md index a074d08..1aaf1d6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -33,6 +33,10 @@ In the case of a hotfix, a short section headed by the new release number should be directly added to this file to describe the related changes. --> +# UNRELEASED + +- Added 2002 and 2005 SoyFACE biomass data. + # Changes in BioCroValidation Version 0.1.0 - This is the first version of BioCroValidation. At this point, the package is diff --git a/data/soyface_biomass.R b/data/soyface_biomass.R new file mode 100644 index 0000000..bb4e1b9 --- /dev/null +++ b/data/soyface_biomass.R @@ -0,0 +1,58 @@ +soyface_biomass <- list( + ambient_2002 = utils::read.table( + textConnection(' + DOY Leaf_Mg_per_ha Stem_Mg_per_ha Pod_Mg_per_ha + 179 0.180284339 0.085244969 0 + 189 0.554461942 0.418853893 0 + 203 1.326552931 1.711067366 0.000317148 + 217 1.697944007 2.892825897 0.079396325 + 231 1.807742782 3.68591426 1.554571303 + 246 1.578813648 3.745260717 3.97601356 + 259 0.947537773 3.618401575 6.544650656 + 288 0 2.305701225 7.008967628 + '), + header = TRUE + ), + ambient_2005 = utils::read.table( + textConnection(' + DOY Leaf_Mg_per_ha Stem_Mg_per_ha Pod_Mg_per_ha + 172 0.222271875 0.188803125 0 + 186 0.8460375 0.85220625 0 + 200 1.184465625 1.61896875 0 + 214 2.218059375 4.043615625 0.29925 + 228 2.147446875 4.477725 2.304553125 + 242 1.51948125 3.8920875 5.532778125 + 256 0.06575625 2.89905 5.371078125 + 270 0 2.1756 6.372253125 + '), + header = TRUE + ), + ambient_2002_std = utils::read.table( + textConnection(' + DOY Leaf_Mg_per_ha Stem_Mg_per_ha Pod_Mg_per_ha + 179 0.04081555 0.017079737 0 + 189 0.163863274 0.138449025 0 + 203 0.133774434 0.183710759 0.000549316 + 217 0.228326658 0.448744065 0.030989998 + 231 0.202475421 0.453447471 0.218402544 + 246 0.075475165 0.275321356 0.573548828 + 259 0.344550033 0.151045378 0.743440701 + 288 0 0.148389261 0.141828617 + '), + header = TRUE + ), + ambient_2005_std = utils::read.table( + textConnection(' + DOY Leaf_Mg_per_ha Stem_Mg_per_ha Pod_Mg_per_ha + 172 0.032896589 0.014318136 0 + 186 0.146798299 0.198830061 0 + 200 0.338074288 0.605286253 0 + 214 0.152175913 0.559874052 0.164275197 + 228 0.119077589 0.306744644 0.434148074 + 242 0.512808699 0.379108485 0.588476976 + 256 0.061686243 0.220823981 0.52004438 + 270 0 0.243254729 0.633090858 + '), + header = TRUE + ) +) diff --git a/man/soyface_biomass.Rd b/man/soyface_biomass.Rd new file mode 100644 index 0000000..adf073e --- /dev/null +++ b/man/soyface_biomass.Rd @@ -0,0 +1,48 @@ +\name{soyface_biomass} + +\docType{data} + +\alias{soyface_biomass} + +\title{SoyFACE Soybean Biomass Data} + +\description{ + Two years of soybean biomass data collected at the SoyFACE facility in + Champaign, IL during the 2002 and 2005 field seasons. +} + +\usage{soyface_biomass} + +\format{ + A list of four named elements, where each element is a data frame with the + following columns: + \itemize{ + \item \code{DOY}: The day of year + \item \code{Leaf_Mg_per_ha}: Leaf biomass per area expressed in Mg / ha + \item \code{Stem_Mg_per_ha}: Stem biomass per area expressed in Mg / ha + \item \code{Pod_Mg_per_ha}: Pod biomass per area expressed in Mg / ha + } + + The elements named \code{ambient_2002} and \code{ambient_2005} represent mean + biomass values measured from plants grown in ambient CO2 conditions during + 2002 and 2005, respectively. + + The elements named \code{ambient_2002_std} and \code{ambient_2005_std} + represent the standard deviation of biomass values measured from plants grown + in ambient CO2 conditions during 2002 and 2005, respectively. +} + +\source{ + This data was obtained from several files available from the + \href{https://github.com/cropsinsilico/soybean-biocro}{Soybean-BioCro GitHub repository}: + \itemize{ + \item \code{Data/SoyFACE_data/2002_ambient_biomass.csv} + \item \code{Data/SoyFACE_data/2005_ambient_biomass.csv} + \item \code{Data/SoyFACE_data/2002_ambient_biomass_std.csv} + \item \code{Data/SoyFACE_data/2005_ambient_biomass_std.csv} + } + + See that repository for more information. +} + +\keyword{datasets} From 344e1cf601043719ba5d584f9b9713d2656d8c06 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 2 May 2025 15:48:00 -0500 Subject: [PATCH 02/54] Start writing `objective_function` --- NAMESPACE | 1 + R/objective_function.R | 113 +++++++++++++++++++++++ tests/testthat/test-objective_function.R | 73 +++++++++++++++ 3 files changed, 187 insertions(+) create mode 100644 R/objective_function.R create mode 100644 tests/testthat/test-objective_function.R diff --git a/NAMESPACE b/NAMESPACE index eb1ea50..ba5cd56 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1 +1,2 @@ +export(objective_function) export(write_model) diff --git a/R/objective_function.R b/R/objective_function.R new file mode 100644 index 0000000..d9e9cf8 --- /dev/null +++ b/R/objective_function.R @@ -0,0 +1,113 @@ +# Helping function for checking the data-driver pairs; will throw an error if +# a problem is detected, and will otherwise be silent with no return value. +check_data_driver_pairs <- function(model_definition, data_driver_pairs) { + if (is.null(names(data_driver_pairs))) { + stop('`data_driver_pairs` must have names') + } + + valid_definitions <- sapply(data_driver_pairs, function(ddp) { + BioCro::validate_dynamical_system_inputs( + model_definition[['initial_values']], + model_definition[['parameters']], + ddp[['drivers']], + model_definition[['direct_modules']], + model_definition[['differential_modules']], + verbose = FALSE + ) + }) + + if (any(!valid_definitions)) { + invalid_ddp <- names(data_driver_pairs)[!valid_definitions] + + msg <- paste( + 'The following drivers did not form a valid dynamical system:', + paste(invalid_ddp, collapse = ', ') + ) + + stop(msg) + } + + return(invisible(NULL)) +} + +# Helping function for getting a model runner +get_model_runner <- function(model_definition, independent_arg_names, ddp) { + tryCatch( + BioCro::partial_run_biocro( + model_definition[['initial_values']], + model_definition[['parameters']], + ddp[['drivers']], + model_definition[['direct_modules']], + model_definition[['differential_modules']], + model_definition[['ode_solver']], + independent_arg_names + ), + error = function(e) {as.character(e)} + ) +} + +# Helping function for checking the model runners; will throw an error if a +# problem is detected, and will otherwise be silent with no return value. +check_runners <- function(model_runners, initial_independent_arg_values) { + # First check for runners that could not be created + bad_runners <- sapply(model_runners, is.character) + + if (any(bad_runners)) { + bad_runner_names <- names(model_runners)[bad_runners] + bad_runner_msg <- as.character(model_runners[bad_runners]) + + msg <- paste0( + 'Model runners could not be created for the following drivers:\n', + paste0(bad_runner_names, ': ', bad_runner_msg, collapse = '') + ) + + stop(msg) + } + + # Now check for runners that cannot be evaluated + runner_eval_msg <- sapply(model_runners, function(runner) { + runner_result <- tryCatch( + runner(initial_independent_arg_values), + error = function(e) {as.character(e)} + ) + + if (is.character(runner_result)) { + runner_result + } else { + '' + } + }) + + if (any(runner_eval_msg != '')) { + bad_runner_names <- names(model_runners)[runner_eval_msg != ''] + bad_runner_msg <- runner_eval_msg[runner_eval_msg != ''] + + msg <- paste0( + 'The model could not be run with the following drivers:\n', + paste0(bad_runner_names, ': ', bad_runner_msg, collapse = '') + ) + + stop(msg) + } + + return(invisible(NULL)) +} + +objective_function <- function( + model_definition, + data_driver_pairs, + independent_arg_names, + initial_independent_arg_values +) +{ + # Check the data-driver pairs + check_data_driver_pairs(model_definition, data_driver_pairs) + + # Get the model runners + model_runners <- lapply(data_driver_pairs, function(ddp) { + get_model_runner(model_definition, independent_arg_names, ddp) + }) + + # Check the model runners + check_runners(model_runners, initial_independent_arg_values) +} diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R new file mode 100644 index 0000000..3d6e177 --- /dev/null +++ b/tests/testthat/test-objective_function.R @@ -0,0 +1,73 @@ +# Specify key inputs to use for these tests +model <- BioCro::soybean + +ddps <- list( + ambient_2002 = list( + data = soyface_biomass[['ambient_2002']], + drivers = BioCro::soybean_weather[['2002']] + ), + ambient_2005 = list( + data = soyface_biomass[['ambient_2005']], + drivers = BioCro::soybean_weather[['2005']] + ) +) + +independent_arg_names <- c('alphaLeaf', 'betaLeaf') + +initial_independent_arg_values <- with(BioCro::soybean[['parameters']], { + c(alphaLeaf, betaLeaf) +}) + +# Run tests +test_that('An objective function can be created', { + obj_fun <- expect_silent( + objective_function( + model, + ddps, + independent_arg_names, + initial_independent_arg_values + ) + ) +}) + +test_that('Bad definitions are detected', { + expect_error( + objective_function( + model, + within(ddps, {ambient_2005$drivers$temp = NULL}), + independent_arg_names, + initial_independent_arg_values + ), + 'The following drivers did not form a valid dynamical system: ambient_2005' + ) +}) + +test_that('Bad argument names are detected', { + expect_error( + objective_function( + model, + ddps, + c(independent_arg_names, 'bad_arg_name'), + initial_independent_arg_values + ), + 'Model runners could not be created for the following drivers: +ambient_2002: Error: `bad_arg_name` from `arg_names` is not in the `initial_values`, `parameters`, or `drivers` +ambient_2005: Error: `bad_arg_name` from `arg_names` is not in the `initial_values`, `parameters`, or `drivers`', + fixed = TRUE + ) +}) + +test_that('Bad initial argument values are detected', { + expect_error( + objective_function( + model, + ddps, + independent_arg_names, + c() + ), + 'The model could not be run with the following drivers: +ambient_2002: Error in runner(initial_independent_arg_values): The unlisted `x` argument (`unlist(x)`) does not have the correct number of elements: required = 2, actual = 0 +ambient_2005: Error in runner(initial_independent_arg_values): The unlisted `x` argument (`unlist(x)`) does not have the correct number of elements: required = 2, actual = 0', + fixed = TRUE + ) +}) From 3433ccc058313fa56977894601bea4b57f2ae46e Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 2 May 2025 16:41:27 -0500 Subject: [PATCH 03/54] Enable dependent parameter values --- R/objective_function.R | 110 +++++++++++++++++++---- tests/testthat/test-objective_function.R | 60 +++++++++++-- 2 files changed, 148 insertions(+), 22 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index d9e9cf8..200fa78 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -30,18 +30,85 @@ check_data_driver_pairs <- function(model_definition, data_driver_pairs) { return(invisible(NULL)) } -# Helping function for getting a model runner -get_model_runner <- function(model_definition, independent_arg_names, ddp) { - tryCatch( - BioCro::partial_run_biocro( - model_definition[['initial_values']], - model_definition[['parameters']], - ddp[['drivers']], - model_definition[['direct_modules']], - model_definition[['differential_modules']], - model_definition[['ode_solver']], - independent_arg_names - ), +# Helping function for checking the independent argument names and initial +# values +check_independent_arguments <- function( + independent_arg_names, + initial_independent_arg_values +) +{ + if (length(independent_arg_names) != length(initial_independent_arg_values)) { + stop('`independent_arg_names` and `initial_independent_arg_values` must have the same length') + } + + if (is.null(names(initial_independent_arg_values))) { + stop('`initial_independent_arg_values` must have names') + } + + if (any(!names(initial_independent_arg_values) %in% independent_arg_names)) { + bad_arg <- !names(initial_independent_arg_values) %in% independent_arg_names + + msg <- paste( + 'The following arguments are included in `initial_independent_arg_values`', + 'but not `independent_arg_names`:', + paste(names(initial_independent_arg_values)[bad_arg], collapse = ', ') + ) + + stop(msg) + } + + return(invisible(NULL)) +} + +# Helping function for getting a model runner; if the runner cannot be created, +# an error message will be returned instead +get_model_runner <- function( + model_definition, + independent_arg_names, + initial_independent_arg_values, + dependent_arg_function, + ddp +) +{ + # Get the full list of arg_names + arg_names <- if (is.null(dependent_arg_function)) { + independent_arg_names + } else { + dependent_arg_values <- + dependent_arg_function(initial_independent_arg_values) + + c(independent_arg_names, names(dependent_arg_values)) + } + + # Build the runner + tryCatch({ + partial_func <- BioCro::partial_run_biocro( + model_definition[['initial_values']], + model_definition[['parameters']], + ddp[['drivers']], + model_definition[['direct_modules']], + model_definition[['differential_modules']], + model_definition[['ode_solver']], + arg_names + ) + + function(x) { + if (!is.numeric(x)) { + stop('`x` must be numeric') + } + + x_for_partial <- if (is.null(dependent_arg_function)) { + x + } else { + x_for_dependent_arg_func <- + stats::setNames(x, independent_arg_names) + + c(x, as.numeric(dependent_arg_function(x_for_dependent_arg_func))) + } + + partial_func(x_for_partial) + } + }, error = function(e) {as.character(e)} ) } @@ -67,7 +134,7 @@ check_runners <- function(model_runners, initial_independent_arg_values) { # Now check for runners that cannot be evaluated runner_eval_msg <- sapply(model_runners, function(runner) { runner_result <- tryCatch( - runner(initial_independent_arg_values), + runner(as.numeric(initial_independent_arg_values)), error = function(e) {as.character(e)} ) @@ -97,15 +164,28 @@ objective_function <- function( model_definition, data_driver_pairs, independent_arg_names, - initial_independent_arg_values + initial_independent_arg_values, + dependent_arg_function = NULL ) { # Check the data-driver pairs check_data_driver_pairs(model_definition, data_driver_pairs) + # Check the independent arguments + check_independent_arguments( + independent_arg_names, + initial_independent_arg_values + ) + # Get the model runners model_runners <- lapply(data_driver_pairs, function(ddp) { - get_model_runner(model_definition, independent_arg_names, ddp) + get_model_runner( + model_definition, + independent_arg_names, + initial_independent_arg_values, + dependent_arg_function, + ddp + ) }) # Check the model runners diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index 3d6e177..cc5c45a 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -15,9 +15,13 @@ ddps <- list( independent_arg_names <- c('alphaLeaf', 'betaLeaf') initial_independent_arg_values <- with(BioCro::soybean[['parameters']], { - c(alphaLeaf, betaLeaf) + list(alphaLeaf = alphaLeaf, betaLeaf = betaLeaf) }) +dependent_arg_function <- function(x) { + list(alphaStem = x[['alphaLeaf']]) +} + # Run tests test_that('An objective function can be created', { obj_fun <- expect_silent( @@ -28,6 +32,16 @@ test_that('An objective function can be created', { initial_independent_arg_values ) ) + + obj_fun <- expect_silent( + objective_function( + model, + ddps, + independent_arg_names, + initial_independent_arg_values, + dependent_arg_function + ) + ) }) test_that('Bad definitions are detected', { @@ -42,13 +56,45 @@ test_that('Bad definitions are detected', { ) }) +test_that('Independent argument names must be consistent', { + expect_error( + objective_function( + model, + ddps, + independent_arg_names, + c() + ), + '`independent_arg_names` and `initial_independent_arg_values` must have the same length' + ) + + expect_error( + objective_function( + model, + ddps, + independent_arg_names, + list(arg1 = 1, arg2 = 2) + ), + 'The following arguments are included in `initial_independent_arg_values` but not `independent_arg_names`: arg1, arg2' + ) + + expect_error( + objective_function( + model, + ddps, + independent_arg_names, + as.numeric(initial_independent_arg_values) + ), + '`initial_independent_arg_values` must have names' + ) +}) + test_that('Bad argument names are detected', { expect_error( objective_function( model, ddps, c(independent_arg_names, 'bad_arg_name'), - initial_independent_arg_values + c(initial_independent_arg_values, list(bad_arg_name = 1)) ), 'Model runners could not be created for the following drivers: ambient_2002: Error: `bad_arg_name` from `arg_names` is not in the `initial_values`, `parameters`, or `drivers` @@ -57,17 +103,17 @@ ambient_2005: Error: `bad_arg_name` from `arg_names` is not in the `initial_valu ) }) -test_that('Bad initial argument values are detected', { +test_that('Model failures are detected', { expect_error( objective_function( - model, + within(model, {parameters$lnfun = 1}), ddps, independent_arg_names, - c() + initial_independent_arg_values ), 'The model could not be run with the following drivers: -ambient_2002: Error in runner(initial_independent_arg_values): The unlisted `x` argument (`unlist(x)`) does not have the correct number of elements: required = 2, actual = 0 -ambient_2005: Error in runner(initial_independent_arg_values): The unlisted `x` argument (`unlist(x)`) does not have the correct number of elements: required = 2, actual = 0', +ambient_2002: Error in as.data.frame(.Call(R_run_biocro, initial_values, parameters, : Caught exception in R_run_biocro: Thrown by the multilayer_canopy_properties module: lnfun != 0 is not yet supported. +ambient_2005: Error in as.data.frame(.Call(R_run_biocro, initial_values, parameters, : Caught exception in R_run_biocro: Thrown by the multilayer_canopy_properties module: lnfun != 0 is not yet supported.', fixed = TRUE ) }) From 952fa26dce15fed292e21725580ceb2448a6b101 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 2 May 2025 17:33:24 -0500 Subject: [PATCH 04/54] Enable post-processing simulation outputs and data definitions --- R/objective_function.R | 121 ++++++++++++++++++++++- tests/testthat/test-objective_function.R | 91 +++++++++++++++-- 2 files changed, 198 insertions(+), 14 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index 200fa78..906a8e7 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -5,6 +5,38 @@ check_data_driver_pairs <- function(model_definition, data_driver_pairs) { stop('`data_driver_pairs` must have names') } + has_elements <- sapply(data_driver_pairs, function(x) { + 'drivers' %in% names(x) && 'data' %in% names(x) + }) + + if (any(!has_elements)) { + missing_elements <- names(data_driver_pairs)[!has_elements] + + msg <- paste( + 'The following data-driver pairs are missing a `drivers` element,', + 'a `data` element, or both:', + paste(missing_elements, collapse = ', ') + ) + + stop(msg) + } + + has_time <- sapply(data_driver_pairs, function(x) { + 'time' %in% colnames(x[['data']]) + }) + + if (any(!has_time)) { + missing_time <- names(data_driver_pairs)[!has_time] + + msg <- paste( + 'The following data-driver pairs are missing a `time` column', + 'in their `data` element:', + paste(missing_time, collapse = ', ') + ) + + stop(msg) + } + valid_definitions <- sapply(data_driver_pairs, function(ddp) { BioCro::validate_dynamical_system_inputs( model_definition[['initial_values']], @@ -67,6 +99,7 @@ get_model_runner <- function( independent_arg_names, initial_independent_arg_values, dependent_arg_function, + post_process_function, ddp ) { @@ -106,7 +139,13 @@ get_model_runner <- function( c(x, as.numeric(dependent_arg_function(x_for_dependent_arg_func))) } - partial_func(x_for_partial) + initial_res <- partial_func(x_for_partial) + + if (is.null(post_process_function)) { + initial_res + } else { + post_process_function(initial_res) + } } }, error = function(e) {as.character(e)} @@ -115,7 +154,12 @@ get_model_runner <- function( # Helping function for checking the model runners; will throw an error if a # problem is detected, and will otherwise be silent with no return value. -check_runners <- function(model_runners, initial_independent_arg_values) { +check_runners <- function( + model_runners, + initial_independent_arg_values, + full_data_definitions +) +{ # First check for runners that could not be created bad_runners <- sapply(model_runners, is.character) @@ -157,15 +201,75 @@ check_runners <- function(model_runners, initial_independent_arg_values) { stop(msg) } + # Now make sure each runner produces the necessary columns in its output + expected_columns <- as.character(full_data_definitions) + + missing_columns <- lapply(model_runners, function(runner) { + runner_result <- runner(as.numeric(initial_independent_arg_values)) + expected_columns[!expected_columns %in% colnames(runner_result)] + }) + + bad_outputs <- sapply(missing_columns, function(x) { + length(x) > 0 + }) + + if (any(bad_outputs)) { + msg <- 'Some data columns were missing from the following runner outputs:' + + for (i in seq_along(bad_outputs)) { + if (bad_outputs[i]) { + msg <- append( + msg, + paste0( + names(model_runners)[i], ': ', + paste(missing_columns[[i]], collapse = ', ') + ) + ) + } + } + + stop(paste(msg, collapse = '\n')) + } + return(invisible(NULL)) } +# Helping function for getting a "data definition list," which specifies the +# names of the `data` columns as they appear in the simulation output +get_data_definition_list <- function(data_driver_pairs, user_data_definitions) { + # First get all the column names found in the observed data + all_data_colnames <- + lapply(data_driver_pairs, function(x) {colnames(x[['data']])}) + + all_data_colnames <- unlist(all_data_colnames) + + all_data_colnames <- all_data_colnames[!duplicated(all_data_colnames)] + + # Remove the `time` column + all_data_colnames <- all_data_colnames[all_data_colnames != 'time'] + + # Build the data definition list + data_definitions <- lapply(all_data_colnames, function(cn) { + if (cn %in% names(user_data_definitions)) { + user_data_definitions[[cn]] + } else { + cn + } + }) + + names(data_definitions) <- all_data_colnames + + data_definitions +} + objective_function <- function( model_definition, data_driver_pairs, independent_arg_names, initial_independent_arg_values, - dependent_arg_function = NULL + data_definitions, + dependent_arg_function = NULL, + post_process_function = NULL ) { # Check the data-driver pairs @@ -184,10 +288,19 @@ objective_function <- function( independent_arg_names, initial_independent_arg_values, dependent_arg_function, + post_process_function, ddp ) }) + # Get the full data definition list + full_data_definitions <- + get_data_definition_list(data_driver_pairs, data_definitions) + # Check the model runners - check_runners(model_runners, initial_independent_arg_values) + check_runners( + model_runners, + initial_independent_arg_values, + full_data_definitions + ) } diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index cc5c45a..7d79e84 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -1,13 +1,14 @@ # Specify key inputs to use for these tests model <- BioCro::soybean +model$ode_solver <- BioCro::default_ode_solvers[['homemade_euler']] ddps <- list( ambient_2002 = list( - data = soyface_biomass[['ambient_2002']], + data = within(soyface_biomass[['ambient_2002']], {time = (DOY - 1) * 24.0; DOY = NULL}), drivers = BioCro::soybean_weather[['2002']] ), ambient_2005 = list( - data = soyface_biomass[['ambient_2005']], + data = within(soyface_biomass[['ambient_2005']], {time = (DOY - 1) * 24.0; DOY = NULL}), drivers = BioCro::soybean_weather[['2005']] ) ) @@ -18,10 +19,20 @@ initial_independent_arg_values <- with(BioCro::soybean[['parameters']], { list(alphaLeaf = alphaLeaf, betaLeaf = betaLeaf) }) +data_definitions <- list( + Leaf_Mg_per_ha = 'Leaf', + Stem_Mg_per_ha = 'Stem', + Pod_Mg_per_ha = 'Pod' +) + dependent_arg_function <- function(x) { list(alphaStem = x[['alphaLeaf']]) } +post_process_function <- function(x) { + within(x, {Pod = Grain + Shell}) +} + # Run tests test_that('An objective function can be created', { obj_fun <- expect_silent( @@ -29,7 +40,9 @@ test_that('An objective function can be created', { model, ddps, independent_arg_names, - initial_independent_arg_values + initial_independent_arg_values, + data_definitions, + post_process_function = post_process_function ) ) @@ -39,7 +52,9 @@ test_that('An objective function can be created', { ddps, independent_arg_names, initial_independent_arg_values, - dependent_arg_function + data_definitions, + dependent_arg_function = dependent_arg_function, + post_process_function = post_process_function ) ) }) @@ -50,7 +65,9 @@ test_that('Bad definitions are detected', { model, within(ddps, {ambient_2005$drivers$temp = NULL}), independent_arg_names, - initial_independent_arg_values + initial_independent_arg_values, + data_definitions, + post_process_function = post_process_function ), 'The following drivers did not form a valid dynamical system: ambient_2005' ) @@ -62,7 +79,9 @@ test_that('Independent argument names must be consistent', { model, ddps, independent_arg_names, - c() + c(), + data_definitions, + post_process_function = post_process_function ), '`independent_arg_names` and `initial_independent_arg_values` must have the same length' ) @@ -72,7 +91,9 @@ test_that('Independent argument names must be consistent', { model, ddps, independent_arg_names, - list(arg1 = 1, arg2 = 2) + list(arg1 = 1, arg2 = 2), + data_definitions, + post_process_function = post_process_function ), 'The following arguments are included in `initial_independent_arg_values` but not `independent_arg_names`: arg1, arg2' ) @@ -82,7 +103,9 @@ test_that('Independent argument names must be consistent', { model, ddps, independent_arg_names, - as.numeric(initial_independent_arg_values) + as.numeric(initial_independent_arg_values), + data_definitions, + post_process_function = post_process_function ), '`initial_independent_arg_values` must have names' ) @@ -94,7 +117,9 @@ test_that('Bad argument names are detected', { model, ddps, c(independent_arg_names, 'bad_arg_name'), - c(initial_independent_arg_values, list(bad_arg_name = 1)) + c(initial_independent_arg_values, list(bad_arg_name = 1)), + data_definitions, + post_process_function = post_process_function ), 'Model runners could not be created for the following drivers: ambient_2002: Error: `bad_arg_name` from `arg_names` is not in the `initial_values`, `parameters`, or `drivers` @@ -109,7 +134,9 @@ test_that('Model failures are detected', { within(model, {parameters$lnfun = 1}), ddps, independent_arg_names, - initial_independent_arg_values + initial_independent_arg_values, + data_definitions, + post_process_function = post_process_function ), 'The model could not be run with the following drivers: ambient_2002: Error in as.data.frame(.Call(R_run_biocro, initial_values, parameters, : Caught exception in R_run_biocro: Thrown by the multilayer_canopy_properties module: lnfun != 0 is not yet supported. @@ -117,3 +144,47 @@ ambient_2005: Error in as.data.frame(.Call(R_run_biocro, initial_values, paramet fixed = TRUE ) }) + +test_that('Data-driver pairs must be complete', { + expect_error( + objective_function( + model, + within(ddps, {ambient_2002$data = NULL; ambient_2005$drivers = NULL}), + independent_arg_names, + initial_independent_arg_values, + data_definitions, + post_process_function = post_process_function + ), + 'The following data-driver pairs are missing a `drivers` element, a `data` element, or both: ambient_2002, ambient_2005' + ) +}) + +test_that('Data must have a `time` column', { + expect_error( + objective_function( + model, + within(ddps, {ambient_2002$data$time = NULL}), + independent_arg_names, + initial_independent_arg_values, + data_definitions, + post_process_function = post_process_function + ), + 'The following data-driver pairs are missing a `time` column in their `data` element: ambient_2002' + ) +}) + +test_that('Missing simulation outputs are detected', { + expect_error( + objective_function( + model, + ddps, + independent_arg_names, + initial_independent_arg_values, + data_definitions + ), + 'Some data columns were missing from the following runner outputs: +ambient_2002: Pod +ambient_2005: Pod', + fixed = TRUE + ) +}) From f1f626fef897c776122a74b5518f5d72d26d4fee Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 2 May 2025 22:13:33 -0500 Subject: [PATCH 05/54] Find time indices --- R/objective_function.R | 108 ++++++++++++++++++++++- tests/testthat/test-objective_function.R | 27 ++++++ 2 files changed, 132 insertions(+), 3 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index 906a8e7..c96c588 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -155,6 +155,7 @@ get_model_runner <- function( # Helping function for checking the model runners; will throw an error if a # problem is detected, and will otherwise be silent with no return value. check_runners <- function( + data_driver_pairs, model_runners, initial_independent_arg_values, full_data_definitions @@ -201,12 +202,16 @@ check_runners <- function( stop(msg) } + # Run each runner + runner_results <- lapply(model_runners, function(runner) { + runner(as.numeric(initial_independent_arg_values)) + }) + # Now make sure each runner produces the necessary columns in its output expected_columns <- as.character(full_data_definitions) - missing_columns <- lapply(model_runners, function(runner) { - runner_result <- runner(as.numeric(initial_independent_arg_values)) - expected_columns[!expected_columns %in% colnames(runner_result)] + missing_columns <- lapply(runner_results, function(res) { + expected_columns[!expected_columns %in% colnames(res)] }) bad_outputs <- sapply(missing_columns, function(x) { @@ -231,6 +236,44 @@ check_runners <- function( stop(paste(msg, collapse = '\n')) } + # Make sure the output from each runner includes the observed times + times_out_of_range <- lapply(seq_along(runner_results), function(i) { + res <- runner_results[[i]] + + min_time <- min(res[['time']]) + max_time <- max(res[['time']]) + + data_times <- data_driver_pairs[[i]][['data']][['time']] + + oor <- sapply(data_times, function(datat) { + datat < min_time || datat > max_time + }) + + data_times[oor] + }) + + bad_times <- sapply(times_out_of_range, function(x) { + length(x) > 0 + }) + + if (any(bad_times)) { + msg <- 'Some observed times were missing from the following runner outputs:' + + for (i in seq_along(bad_times)) { + if (bad_times[i]) { + msg <- append( + msg, + paste0( + names(model_runners)[i], ': ', + paste(times_out_of_range[[i]], collapse = ', ') + ) + ) + } + } + + stop(paste(msg, collapse = '\n')) + } + return(invisible(NULL)) } @@ -262,6 +305,53 @@ get_data_definition_list <- function(data_driver_pairs, user_data_definitions) { data_definitions } +# Helping function for converting each data table to a "long form." +get_long_form_data <- function(data_driver_pairs, full_data_definitions) { + lapply(data_driver_pairs, function(ddp) { + short_form_data <- ddp[['data']] + + data_column_names <- colnames(short_form_data) + data_column_names <- data_column_names[data_column_names != 'time'] + + long_form_data_list <- lapply(data_column_names, function(cn) { + data.frame( + time = short_form_data[, 'time'], + quantity_name = full_data_definitions[[cn]], + quantity_value = short_form_data[, cn], + stringsAsFactors = FALSE + ) + }) + + long_form_data <- do.call(rbind, long_form_data_list) + + long_form_data[!is.na(long_form_data[['quantity_value']]), ] + }) +} + +# Helping function for getting time indices +add_time_indices <- function( + model_runners, + initial_independent_arg_values, + long_form_data +) +{ + for (i in seq_along(long_form_data)) { + runner <- model_runners[[i]] + res <- runner(as.numeric(initial_independent_arg_values)) + + dataf <- long_form_data[[i]] + indices <- sapply(dataf[, 'time'], function(x) { + tdiff <- abs(res[, 'time'] - x) + which(tdiff == min(tdiff)) + }) + + long_form_data[[i]][, 'time_index'] <- indices + long_form_data[[i]][, 'expected_npts'] <- nrow(res) + } + + long_form_data +} + objective_function <- function( model_definition, data_driver_pairs, @@ -299,8 +389,20 @@ objective_function <- function( # Check the model runners check_runners( + data_driver_pairs, model_runners, initial_independent_arg_values, full_data_definitions ) + + # Get the long-form data + long_form_data <- + get_long_form_data(data_driver_pairs, full_data_definitions) + + # Find indices corresponding to the measured time points + long_form_data <- add_time_indices( + model_runners, + initial_independent_arg_values, + long_form_data + ) } diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index 7d79e84..b682967 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -45,6 +45,17 @@ test_that('An objective function can be created', { post_process_function = post_process_function ) ) + + obj_fun <- expect_silent( + objective_function( + model, + ddps[1], + independent_arg_names, + initial_independent_arg_values, + data_definitions, + post_process_function = post_process_function + ) + ) obj_fun <- expect_silent( objective_function( @@ -188,3 +199,19 @@ ambient_2005: Pod', fixed = TRUE ) }) + +test_that('Out-of-range times are detected', { + expect_error( + objective_function( + model, + within(ddps, {ambient_2002$data$time <- ambient_2002$data$time + 1e5}), + independent_arg_names, + initial_independent_arg_values, + data_definitions, + post_process_function = post_process_function + ), + 'Some observed times were missing from the following runner outputs: +ambient_2002: 104272, 104512, 104848, 105184, 105520, 105880, 106192, 106888', + fixed = TRUE + ) +}) From 8b9d731ef0616fdc0a12167b6f0395e65302512f Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 2 May 2025 23:38:39 -0500 Subject: [PATCH 06/54] Return the total error function --- R/objective_function.R | 144 ++++++++++++++++++++++- tests/testthat/test-objective_function.R | 70 ++++++++++- 2 files changed, 208 insertions(+), 6 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index c96c588..faadbbf 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -279,7 +279,8 @@ check_runners <- function( # Helping function for getting a "data definition list," which specifies the # names of the `data` columns as they appear in the simulation output -get_data_definition_list <- function(data_driver_pairs, user_data_definitions) { +get_data_definition_list <- function(data_driver_pairs, user_data_definitions) +{ # First get all the column names found in the observed data all_data_colnames <- lapply(data_driver_pairs, function(x) {colnames(x[['data']])}) @@ -337,29 +338,129 @@ add_time_indices <- function( { for (i in seq_along(long_form_data)) { runner <- model_runners[[i]] - res <- runner(as.numeric(initial_independent_arg_values)) + res <- runner(as.numeric(initial_independent_arg_values)) dataf <- long_form_data[[i]] + indices <- sapply(dataf[, 'time'], function(x) { tdiff <- abs(res[, 'time'] - x) which(tdiff == min(tdiff)) }) - long_form_data[[i]][, 'time_index'] <- indices + long_form_data[[i]][, 'time_index'] <- indices long_form_data[[i]][, 'expected_npts'] <- nrow(res) } long_form_data } +# Helping function that processes and checks the quantity weights +process_quantity_weights <- function(quantity_weights, long_form_data) { + # First make sure that weights have been provided for all measured + # quantities + all_data_colnames <- lapply(long_form_data, function(x) { + unique(x[, 'quantity_name']) + }) + + all_data_colnames <- unlist(all_data_colnames) + + all_data_colnames <- unique(all_data_colnames) + + weight_was_supplied <- sapply(all_data_colnames, function(cn) { + cn %in% names(quantity_weights) + }) + + if (any(!weight_was_supplied)) { + missing_weights <- all_data_colnames[!weight_was_supplied] + + msg <- paste( + 'Weights were not supplied for the following measured quantities:', + paste(missing_weights, collapse = ', ') + ) + + stop(msg) + } + + # Now make sure all the weights have length 2 + lapply(quantity_weights, function(wt) { + rep_len(wt, 2) + }) +} + +# Helping function that calculates one normalization factor +one_norm <- function(long_form_data_table, qname, normalization_method) { + if (tolower(normalization_method) == 'mean_max') { + npts <- sum(long_form_data_table[, 'quantity_name'] == qname) + qmax <- max(long_form_data_table[long_form_data_table[, 'quantity_name'] == qname, 'quantity_value']) + npts * qmax^2 + } else { + stop('Unsupported normalization_method: ', normalization_method) + } +} + +# Helping function that calculates one error +one_error <- function(observed, predicted, weight, normalization) { + weight_multiplier <- if (observed <= predicted) { + weight[1] + } else { + weight[2] + } + + (observed - predicted)^2 * weight_multiplier / normalization +} + +# Helping function that calculates an error value from a simulation result +error_from_res <- function( + simulation_result, + long_form_data_table, + quantity_weights, + normalization_method, + extra_penalty_function +) +{ + # If the simulation did not finish, return a very high value + expected_npts <- long_form_data_table[1, 'expected_npts'] + + if (nrow(simulation_result) < expected_npts) { + return(1e6) + } + + # Calculate any user-specified penalties + penalty <- if (is.null(extra_penalty_function)) { + 0.0 + } else { + extra_penalty_function(simulation_result) + } + + # Calculate the error terms + n_obs <- nrow(long_form_data_table) + + errors <- sapply(seq_len(n_obs), function(i) { + qname <- as.character(long_form_data_table[i, 'quantity_name']) + obs <- long_form_data_table[i, 'quantity_value'] + indx <- long_form_data_table[i, 'time_index'] + pred <- simulation_result[indx, qname] + wt <- quantity_weights[[qname]] + norm <- one_norm(long_form_data_table, qname, normalization_method) + + one_error(obs, pred, wt, norm) + }) + + # Return the sum of the penalty and error terms + penalty + sum(errors) +} + objective_function <- function( model_definition, data_driver_pairs, independent_arg_names, initial_independent_arg_values, data_definitions, + quantity_weights, + normalization_method, dependent_arg_function = NULL, - post_process_function = NULL + post_process_function = NULL, + extra_penalty_function = NULL ) { # Check the data-driver pairs @@ -405,4 +506,39 @@ objective_function <- function( initial_independent_arg_values, long_form_data ) + + # Process the quantity weights + processed_weights <- process_quantity_weights(quantity_weights, long_form_data) + + # Create and test the total error function + total_error_function <- function(x) { + errors <- sapply(seq_along(model_runners), function(i) { + runner <- model_runners[[i]] + res <- runner(x) + + error_from_res( + res, + long_form_data[[i]], + processed_weights, + normalization_method, + extra_penalty_function + ) + }) + + sum(errors) + } + + initial_error <- + total_error_function(as.numeric(initial_independent_arg_values)) + + if (!is.finite(initial_error)) { + stop( + 'The objective function did not return a finite value when using', + 'the initial argument values: ', + initial_error + ) + } + + # Return it + total_error_function } diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index b682967..1cbf885 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -33,6 +33,14 @@ post_process_function <- function(x) { within(x, {Pod = Grain + Shell}) } +quantity_weights <- list( + Leaf = 0.5, + Stem = 0.5, + Pod = 1 +) + +normalization_method <- 'mean_max' + # Run tests test_that('An objective function can be created', { obj_fun <- expect_silent( @@ -42,10 +50,12 @@ test_that('An objective function can be created', { independent_arg_names, initial_independent_arg_values, data_definitions, + quantity_weights, + normalization_method, post_process_function = post_process_function ) ) - + obj_fun <- expect_silent( objective_function( model, @@ -53,6 +63,8 @@ test_that('An objective function can be created', { independent_arg_names, initial_independent_arg_values, data_definitions, + quantity_weights, + normalization_method, post_process_function = post_process_function ) ) @@ -64,6 +76,8 @@ test_that('An objective function can be created', { independent_arg_names, initial_independent_arg_values, data_definitions, + quantity_weights, + normalization_method, dependent_arg_function = dependent_arg_function, post_process_function = post_process_function ) @@ -78,6 +92,8 @@ test_that('Bad definitions are detected', { independent_arg_names, initial_independent_arg_values, data_definitions, + quantity_weights, + normalization_method, post_process_function = post_process_function ), 'The following drivers did not form a valid dynamical system: ambient_2005' @@ -92,6 +108,8 @@ test_that('Independent argument names must be consistent', { independent_arg_names, c(), data_definitions, + quantity_weights, + normalization_method, post_process_function = post_process_function ), '`independent_arg_names` and `initial_independent_arg_values` must have the same length' @@ -104,6 +122,8 @@ test_that('Independent argument names must be consistent', { independent_arg_names, list(arg1 = 1, arg2 = 2), data_definitions, + quantity_weights, + normalization_method, post_process_function = post_process_function ), 'The following arguments are included in `initial_independent_arg_values` but not `independent_arg_names`: arg1, arg2' @@ -116,6 +136,8 @@ test_that('Independent argument names must be consistent', { independent_arg_names, as.numeric(initial_independent_arg_values), data_definitions, + quantity_weights, + normalization_method, post_process_function = post_process_function ), '`initial_independent_arg_values` must have names' @@ -130,6 +152,8 @@ test_that('Bad argument names are detected', { c(independent_arg_names, 'bad_arg_name'), c(initial_independent_arg_values, list(bad_arg_name = 1)), data_definitions, + quantity_weights, + normalization_method, post_process_function = post_process_function ), 'Model runners could not be created for the following drivers: @@ -147,6 +171,8 @@ test_that('Model failures are detected', { independent_arg_names, initial_independent_arg_values, data_definitions, + quantity_weights, + normalization_method, post_process_function = post_process_function ), 'The model could not be run with the following drivers: @@ -164,6 +190,8 @@ test_that('Data-driver pairs must be complete', { independent_arg_names, initial_independent_arg_values, data_definitions, + quantity_weights, + normalization_method, post_process_function = post_process_function ), 'The following data-driver pairs are missing a `drivers` element, a `data` element, or both: ambient_2002, ambient_2005' @@ -178,6 +206,8 @@ test_that('Data must have a `time` column', { independent_arg_names, initial_independent_arg_values, data_definitions, + quantity_weights, + normalization_method, post_process_function = post_process_function ), 'The following data-driver pairs are missing a `time` column in their `data` element: ambient_2002' @@ -191,7 +221,9 @@ test_that('Missing simulation outputs are detected', { ddps, independent_arg_names, initial_independent_arg_values, - data_definitions + data_definitions, + quantity_weights, + normalization_method ), 'Some data columns were missing from the following runner outputs: ambient_2002: Pod @@ -208,6 +240,8 @@ test_that('Out-of-range times are detected', { independent_arg_names, initial_independent_arg_values, data_definitions, + quantity_weights, + normalization_method, post_process_function = post_process_function ), 'Some observed times were missing from the following runner outputs: @@ -215,3 +249,35 @@ ambient_2002: 104272, 104512, 104848, 105184, 105520, 105880, 106192, 106888', fixed = TRUE ) }) + +test_that('Weights must be supplied for all measured quantities', { + expect_error( + objective_function( + model, + ddps, + independent_arg_names, + initial_independent_arg_values, + data_definitions, + list(), + normalization_method, + post_process_function = post_process_function + ), + 'Weights were not supplied for the following measured quantities: Leaf, Stem, Pod' + ) +}) + +test_that('Bad normalization methods are detected', { + expect_error( + objective_function( + model, + ddps, + independent_arg_names, + initial_independent_arg_values, + data_definitions, + quantity_weights, + 'bad_normalization_method', + post_process_function = post_process_function + ), + 'Unsupported normalization_method: bad_normalization_method' + ) +}) From 9d4a04f43b4c1c9b6a528f0279338e60973266ed Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Sat, 3 May 2025 11:21:07 -0500 Subject: [PATCH 07/54] More tests --- R/objective_function.R | 85 ++++++++++++++++-------- tests/testthat/test-objective_function.R | 17 +++++ 2 files changed, 73 insertions(+), 29 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index faadbbf..efb6d8f 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -277,6 +277,22 @@ check_runners <- function( return(invisible(NULL)) } +# Helping function for checking the objective function; will throw an error if a +# problem is detected, and will otherwise be silent with no return value. +check_obj_fun <- function(obj_fun, initial_independent_arg_values) { + initial_error <- obj_fun(as.numeric(initial_independent_arg_values)) + + if (!is.finite(initial_error)) { + stop( + 'The objective function did not return a finite value when using ', + 'the initial argument values; instead, it returned: ', + initial_error + ) + } + + return(invisible(NULL)) +} + # Helping function for getting a "data definition list," which specifies the # names of the `data` columns as they appear in the simulation output get_data_definition_list <- function(data_driver_pairs, user_data_definitions) @@ -450,6 +466,33 @@ error_from_res <- function( penalty + sum(errors) } +# Helping function that forms the overall objective function +get_obj_fun <- function( + model_runners, + long_form_data, + processed_weights, + normalization_method, + extra_penalty_function +) +{ + function(x) { + errors <- sapply(seq_along(model_runners), function(i) { + runner <- model_runners[[i]] + res <- runner(x) + + error_from_res( + res, + long_form_data[[i]], + processed_weights, + normalization_method, + extra_penalty_function + ) + }) + + sum(errors) + } +} + objective_function <- function( model_definition, data_driver_pairs, @@ -508,37 +551,21 @@ objective_function <- function( ) # Process the quantity weights - processed_weights <- process_quantity_weights(quantity_weights, long_form_data) - - # Create and test the total error function - total_error_function <- function(x) { - errors <- sapply(seq_along(model_runners), function(i) { - runner <- model_runners[[i]] - res <- runner(x) - - error_from_res( - res, - long_form_data[[i]], - processed_weights, - normalization_method, - extra_penalty_function - ) - }) - - sum(errors) - } + processed_weights <- + process_quantity_weights(quantity_weights, long_form_data) - initial_error <- - total_error_function(as.numeric(initial_independent_arg_values)) + # Create the objective function + obj_fun <- get_obj_fun( + model_runners, + long_form_data, + processed_weights, + normalization_method, + extra_penalty_function + ) - if (!is.finite(initial_error)) { - stop( - 'The objective function did not return a finite value when using', - 'the initial argument values: ', - initial_error - ) - } + # Check the objective function + check_obj_fun(obj_fun, initial_independent_arg_values) # Return it - total_error_function + obj_fun } diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index 1cbf885..731156f 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -281,3 +281,20 @@ test_that('Bad normalization methods are detected', { 'Unsupported normalization_method: bad_normalization_method' ) }) + +test_that('Bad return values are detected', { + expect_error( + objective_function( + model, + ddps, + independent_arg_names, + initial_independent_arg_values, + data_definitions, + quantity_weights, + normalization_method, + post_process_function = post_process_function, + extra_penalty_function = function(x) {NA} + ), + 'The objective function did not return a finite value when using the initial argument values; instead, it returned: NA' + ) +}) From cad412b76b1e7faac0e087ea8214a8fac0548bfc Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Sat, 3 May 2025 11:44:48 -0500 Subject: [PATCH 08/54] Add some default values --- R/objective_function.R | 4 +- tests/testthat/test-objective_function.R | 51 ++++++++---------------- 2 files changed, 19 insertions(+), 36 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index efb6d8f..de0cbad 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -498,9 +498,9 @@ objective_function <- function( data_driver_pairs, independent_arg_names, initial_independent_arg_values, - data_definitions, quantity_weights, - normalization_method, + data_definitions = list(), + normalization_method = 'mean_max', dependent_arg_function = NULL, post_process_function = NULL, extra_penalty_function = NULL diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index 731156f..75dea53 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -39,8 +39,6 @@ quantity_weights <- list( Pod = 1 ) -normalization_method <- 'mean_max' - # Run tests test_that('An objective function can be created', { obj_fun <- expect_silent( @@ -49,9 +47,8 @@ test_that('An objective function can be created', { ddps, independent_arg_names, initial_independent_arg_values, - data_definitions, quantity_weights, - normalization_method, + data_definitions = data_definitions, post_process_function = post_process_function ) ) @@ -62,9 +59,8 @@ test_that('An objective function can be created', { ddps[1], independent_arg_names, initial_independent_arg_values, - data_definitions, quantity_weights, - normalization_method, + data_definitions = data_definitions, post_process_function = post_process_function ) ) @@ -75,9 +71,8 @@ test_that('An objective function can be created', { ddps, independent_arg_names, initial_independent_arg_values, - data_definitions, quantity_weights, - normalization_method, + data_definitions = data_definitions, dependent_arg_function = dependent_arg_function, post_process_function = post_process_function ) @@ -91,9 +86,8 @@ test_that('Bad definitions are detected', { within(ddps, {ambient_2005$drivers$temp = NULL}), independent_arg_names, initial_independent_arg_values, - data_definitions, quantity_weights, - normalization_method, + data_definitions = data_definitions, post_process_function = post_process_function ), 'The following drivers did not form a valid dynamical system: ambient_2005' @@ -107,9 +101,8 @@ test_that('Independent argument names must be consistent', { ddps, independent_arg_names, c(), - data_definitions, quantity_weights, - normalization_method, + data_definitions = data_definitions, post_process_function = post_process_function ), '`independent_arg_names` and `initial_independent_arg_values` must have the same length' @@ -121,9 +114,8 @@ test_that('Independent argument names must be consistent', { ddps, independent_arg_names, list(arg1 = 1, arg2 = 2), - data_definitions, quantity_weights, - normalization_method, + data_definitions = data_definitions, post_process_function = post_process_function ), 'The following arguments are included in `initial_independent_arg_values` but not `independent_arg_names`: arg1, arg2' @@ -135,9 +127,8 @@ test_that('Independent argument names must be consistent', { ddps, independent_arg_names, as.numeric(initial_independent_arg_values), - data_definitions, quantity_weights, - normalization_method, + data_definitions = data_definitions, post_process_function = post_process_function ), '`initial_independent_arg_values` must have names' @@ -151,9 +142,8 @@ test_that('Bad argument names are detected', { ddps, c(independent_arg_names, 'bad_arg_name'), c(initial_independent_arg_values, list(bad_arg_name = 1)), - data_definitions, quantity_weights, - normalization_method, + data_definitions = data_definitions, post_process_function = post_process_function ), 'Model runners could not be created for the following drivers: @@ -170,9 +160,8 @@ test_that('Model failures are detected', { ddps, independent_arg_names, initial_independent_arg_values, - data_definitions, quantity_weights, - normalization_method, + data_definitions = data_definitions, post_process_function = post_process_function ), 'The model could not be run with the following drivers: @@ -189,9 +178,8 @@ test_that('Data-driver pairs must be complete', { within(ddps, {ambient_2002$data = NULL; ambient_2005$drivers = NULL}), independent_arg_names, initial_independent_arg_values, - data_definitions, quantity_weights, - normalization_method, + data_definitions = data_definitions, post_process_function = post_process_function ), 'The following data-driver pairs are missing a `drivers` element, a `data` element, or both: ambient_2002, ambient_2005' @@ -205,9 +193,8 @@ test_that('Data must have a `time` column', { within(ddps, {ambient_2002$data$time = NULL}), independent_arg_names, initial_independent_arg_values, - data_definitions, quantity_weights, - normalization_method, + data_definitions = data_definitions, post_process_function = post_process_function ), 'The following data-driver pairs are missing a `time` column in their `data` element: ambient_2002' @@ -221,9 +208,8 @@ test_that('Missing simulation outputs are detected', { ddps, independent_arg_names, initial_independent_arg_values, - data_definitions, quantity_weights, - normalization_method + data_definitions = data_definitions ), 'Some data columns were missing from the following runner outputs: ambient_2002: Pod @@ -239,9 +225,8 @@ test_that('Out-of-range times are detected', { within(ddps, {ambient_2002$data$time <- ambient_2002$data$time + 1e5}), independent_arg_names, initial_independent_arg_values, - data_definitions, quantity_weights, - normalization_method, + data_definitions = data_definitions, post_process_function = post_process_function ), 'Some observed times were missing from the following runner outputs: @@ -257,9 +242,8 @@ test_that('Weights must be supplied for all measured quantities', { ddps, independent_arg_names, initial_independent_arg_values, - data_definitions, list(), - normalization_method, + data_definitions = data_definitions, post_process_function = post_process_function ), 'Weights were not supplied for the following measured quantities: Leaf, Stem, Pod' @@ -273,9 +257,9 @@ test_that('Bad normalization methods are detected', { ddps, independent_arg_names, initial_independent_arg_values, - data_definitions, quantity_weights, - 'bad_normalization_method', + normalization_method = 'bad_normalization_method', + data_definitions = data_definitions, post_process_function = post_process_function ), 'Unsupported normalization_method: bad_normalization_method' @@ -289,9 +273,8 @@ test_that('Bad return values are detected', { ddps, independent_arg_names, initial_independent_arg_values, - data_definitions, quantity_weights, - normalization_method, + data_definitions = data_definitions, post_process_function = post_process_function, extra_penalty_function = function(x) {NA} ), From 6e591333e0e17fb849e6f1fb393fbe5dec3b2f58 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Sat, 3 May 2025 11:53:34 -0500 Subject: [PATCH 09/54] Rename a few things --- R/objective_function.R | 87 ++++++++++++------------ tests/testthat/test-objective_function.R | 40 +++++------ 2 files changed, 65 insertions(+), 62 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index de0cbad..7ba6b1f 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -1,6 +1,6 @@ # Helping function for checking the data-driver pairs; will throw an error if # a problem is detected, and will otherwise be silent with no return value. -check_data_driver_pairs <- function(model_definition, data_driver_pairs) { +check_data_driver_pairs <- function(base_model_definition, data_driver_pairs) { if (is.null(names(data_driver_pairs))) { stop('`data_driver_pairs` must have names') } @@ -39,11 +39,11 @@ check_data_driver_pairs <- function(model_definition, data_driver_pairs) { valid_definitions <- sapply(data_driver_pairs, function(ddp) { BioCro::validate_dynamical_system_inputs( - model_definition[['initial_values']], - model_definition[['parameters']], + base_model_definition[['initial_values']], + base_model_definition[['parameters']], ddp[['drivers']], - model_definition[['direct_modules']], - model_definition[['differential_modules']], + base_model_definition[['direct_modules']], + base_model_definition[['differential_modules']], verbose = FALSE ) }) @@ -66,24 +66,27 @@ check_data_driver_pairs <- function(model_definition, data_driver_pairs) { # values check_independent_arguments <- function( independent_arg_names, - initial_independent_arg_values + initial_ind_arg_values ) { - if (length(independent_arg_names) != length(initial_independent_arg_values)) { - stop('`independent_arg_names` and `initial_independent_arg_values` must have the same length') + if (length(independent_arg_names) != length(initial_ind_arg_values)) { + stop( + '`independent_arg_names` and `initial_ind_arg_values` ', + 'must have the same length' + ) } - if (is.null(names(initial_independent_arg_values))) { - stop('`initial_independent_arg_values` must have names') + if (is.null(names(initial_ind_arg_values))) { + stop('`initial_ind_arg_values` must have names') } - if (any(!names(initial_independent_arg_values) %in% independent_arg_names)) { - bad_arg <- !names(initial_independent_arg_values) %in% independent_arg_names + if (any(!names(initial_ind_arg_values) %in% independent_arg_names)) { + bad_arg <- !names(initial_ind_arg_values) %in% independent_arg_names msg <- paste( - 'The following arguments are included in `initial_independent_arg_values`', + 'The following arguments are included in `initial_ind_arg_values`', 'but not `independent_arg_names`:', - paste(names(initial_independent_arg_values)[bad_arg], collapse = ', ') + paste(names(initial_ind_arg_values)[bad_arg], collapse = ', ') ) stop(msg) @@ -95,9 +98,9 @@ check_independent_arguments <- function( # Helping function for getting a model runner; if the runner cannot be created, # an error message will be returned instead get_model_runner <- function( - model_definition, + base_model_definition, independent_arg_names, - initial_independent_arg_values, + initial_ind_arg_values, dependent_arg_function, post_process_function, ddp @@ -108,7 +111,7 @@ get_model_runner <- function( independent_arg_names } else { dependent_arg_values <- - dependent_arg_function(initial_independent_arg_values) + dependent_arg_function(initial_ind_arg_values) c(independent_arg_names, names(dependent_arg_values)) } @@ -116,12 +119,12 @@ get_model_runner <- function( # Build the runner tryCatch({ partial_func <- BioCro::partial_run_biocro( - model_definition[['initial_values']], - model_definition[['parameters']], + base_model_definition[['initial_values']], + base_model_definition[['parameters']], ddp[['drivers']], - model_definition[['direct_modules']], - model_definition[['differential_modules']], - model_definition[['ode_solver']], + base_model_definition[['direct_modules']], + base_model_definition[['differential_modules']], + base_model_definition[['ode_solver']], arg_names ) @@ -133,10 +136,10 @@ get_model_runner <- function( x_for_partial <- if (is.null(dependent_arg_function)) { x } else { - x_for_dependent_arg_func <- + x_for_dep_arg_func <- stats::setNames(x, independent_arg_names) - c(x, as.numeric(dependent_arg_function(x_for_dependent_arg_func))) + c(x, as.numeric(dependent_arg_function(x_for_dep_arg_func))) } initial_res <- partial_func(x_for_partial) @@ -157,7 +160,7 @@ get_model_runner <- function( check_runners <- function( data_driver_pairs, model_runners, - initial_independent_arg_values, + initial_ind_arg_values, full_data_definitions ) { @@ -179,7 +182,7 @@ check_runners <- function( # Now check for runners that cannot be evaluated runner_eval_msg <- sapply(model_runners, function(runner) { runner_result <- tryCatch( - runner(as.numeric(initial_independent_arg_values)), + runner(as.numeric(initial_ind_arg_values)), error = function(e) {as.character(e)} ) @@ -204,7 +207,7 @@ check_runners <- function( # Run each runner runner_results <- lapply(model_runners, function(runner) { - runner(as.numeric(initial_independent_arg_values)) + runner(as.numeric(initial_ind_arg_values)) }) # Now make sure each runner produces the necessary columns in its output @@ -219,7 +222,7 @@ check_runners <- function( }) if (any(bad_outputs)) { - msg <- 'Some data columns were missing from the following runner outputs:' + msg <- 'Some data columns were missing from runner outputs:' for (i in seq_along(bad_outputs)) { if (bad_outputs[i]) { @@ -257,7 +260,7 @@ check_runners <- function( }) if (any(bad_times)) { - msg <- 'Some observed times were missing from the following runner outputs:' + msg <- 'Some observed times were missing from runner outputs:' for (i in seq_along(bad_times)) { if (bad_times[i]) { @@ -279,8 +282,8 @@ check_runners <- function( # Helping function for checking the objective function; will throw an error if a # problem is detected, and will otherwise be silent with no return value. -check_obj_fun <- function(obj_fun, initial_independent_arg_values) { - initial_error <- obj_fun(as.numeric(initial_independent_arg_values)) +check_obj_fun <- function(obj_fun, initial_ind_arg_values) { + initial_error <- obj_fun(as.numeric(initial_ind_arg_values)) if (!is.finite(initial_error)) { stop( @@ -348,13 +351,13 @@ get_long_form_data <- function(data_driver_pairs, full_data_definitions) { # Helping function for getting time indices add_time_indices <- function( model_runners, - initial_independent_arg_values, + initial_ind_arg_values, long_form_data ) { for (i in seq_along(long_form_data)) { runner <- model_runners[[i]] - res <- runner(as.numeric(initial_independent_arg_values)) + res <- runner(as.numeric(initial_ind_arg_values)) dataf <- long_form_data[[i]] @@ -494,10 +497,10 @@ get_obj_fun <- function( } objective_function <- function( - model_definition, + base_model_definition, data_driver_pairs, independent_arg_names, - initial_independent_arg_values, + initial_ind_arg_values, quantity_weights, data_definitions = list(), normalization_method = 'mean_max', @@ -507,20 +510,20 @@ objective_function <- function( ) { # Check the data-driver pairs - check_data_driver_pairs(model_definition, data_driver_pairs) + check_data_driver_pairs(base_model_definition, data_driver_pairs) # Check the independent arguments check_independent_arguments( independent_arg_names, - initial_independent_arg_values + initial_ind_arg_values ) # Get the model runners model_runners <- lapply(data_driver_pairs, function(ddp) { get_model_runner( - model_definition, + base_model_definition, independent_arg_names, - initial_independent_arg_values, + initial_ind_arg_values, dependent_arg_function, post_process_function, ddp @@ -535,7 +538,7 @@ objective_function <- function( check_runners( data_driver_pairs, model_runners, - initial_independent_arg_values, + initial_ind_arg_values, full_data_definitions ) @@ -546,7 +549,7 @@ objective_function <- function( # Find indices corresponding to the measured time points long_form_data <- add_time_indices( model_runners, - initial_independent_arg_values, + initial_ind_arg_values, long_form_data ) @@ -564,7 +567,7 @@ objective_function <- function( ) # Check the objective function - check_obj_fun(obj_fun, initial_independent_arg_values) + check_obj_fun(obj_fun, initial_ind_arg_values) # Return it obj_fun diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index 75dea53..bd513dd 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -15,7 +15,7 @@ ddps <- list( independent_arg_names <- c('alphaLeaf', 'betaLeaf') -initial_independent_arg_values <- with(BioCro::soybean[['parameters']], { +initial_ind_arg_values <- with(BioCro::soybean[['parameters']], { list(alphaLeaf = alphaLeaf, betaLeaf = betaLeaf) }) @@ -46,7 +46,7 @@ test_that('An objective function can be created', { model, ddps, independent_arg_names, - initial_independent_arg_values, + initial_ind_arg_values, quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function @@ -58,7 +58,7 @@ test_that('An objective function can be created', { model, ddps[1], independent_arg_names, - initial_independent_arg_values, + initial_ind_arg_values, quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function @@ -70,7 +70,7 @@ test_that('An objective function can be created', { model, ddps, independent_arg_names, - initial_independent_arg_values, + initial_ind_arg_values, quantity_weights, data_definitions = data_definitions, dependent_arg_function = dependent_arg_function, @@ -85,7 +85,7 @@ test_that('Bad definitions are detected', { model, within(ddps, {ambient_2005$drivers$temp = NULL}), independent_arg_names, - initial_independent_arg_values, + initial_ind_arg_values, quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function @@ -105,7 +105,7 @@ test_that('Independent argument names must be consistent', { data_definitions = data_definitions, post_process_function = post_process_function ), - '`independent_arg_names` and `initial_independent_arg_values` must have the same length' + '`independent_arg_names` and `initial_ind_arg_values` must have the same length' ) expect_error( @@ -118,7 +118,7 @@ test_that('Independent argument names must be consistent', { data_definitions = data_definitions, post_process_function = post_process_function ), - 'The following arguments are included in `initial_independent_arg_values` but not `independent_arg_names`: arg1, arg2' + 'The following arguments are included in `initial_ind_arg_values` but not `independent_arg_names`: arg1, arg2' ) expect_error( @@ -126,12 +126,12 @@ test_that('Independent argument names must be consistent', { model, ddps, independent_arg_names, - as.numeric(initial_independent_arg_values), + as.numeric(initial_ind_arg_values), quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function ), - '`initial_independent_arg_values` must have names' + '`initial_ind_arg_values` must have names' ) }) @@ -141,7 +141,7 @@ test_that('Bad argument names are detected', { model, ddps, c(independent_arg_names, 'bad_arg_name'), - c(initial_independent_arg_values, list(bad_arg_name = 1)), + c(initial_ind_arg_values, list(bad_arg_name = 1)), quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function @@ -159,7 +159,7 @@ test_that('Model failures are detected', { within(model, {parameters$lnfun = 1}), ddps, independent_arg_names, - initial_independent_arg_values, + initial_ind_arg_values, quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function @@ -177,7 +177,7 @@ test_that('Data-driver pairs must be complete', { model, within(ddps, {ambient_2002$data = NULL; ambient_2005$drivers = NULL}), independent_arg_names, - initial_independent_arg_values, + initial_ind_arg_values, quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function @@ -192,7 +192,7 @@ test_that('Data must have a `time` column', { model, within(ddps, {ambient_2002$data$time = NULL}), independent_arg_names, - initial_independent_arg_values, + initial_ind_arg_values, quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function @@ -207,11 +207,11 @@ test_that('Missing simulation outputs are detected', { model, ddps, independent_arg_names, - initial_independent_arg_values, + initial_ind_arg_values, quantity_weights, data_definitions = data_definitions ), - 'Some data columns were missing from the following runner outputs: + 'Some data columns were missing from runner outputs: ambient_2002: Pod ambient_2005: Pod', fixed = TRUE @@ -224,12 +224,12 @@ test_that('Out-of-range times are detected', { model, within(ddps, {ambient_2002$data$time <- ambient_2002$data$time + 1e5}), independent_arg_names, - initial_independent_arg_values, + initial_ind_arg_values, quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function ), - 'Some observed times were missing from the following runner outputs: + 'Some observed times were missing from runner outputs: ambient_2002: 104272, 104512, 104848, 105184, 105520, 105880, 106192, 106888', fixed = TRUE ) @@ -241,7 +241,7 @@ test_that('Weights must be supplied for all measured quantities', { model, ddps, independent_arg_names, - initial_independent_arg_values, + initial_ind_arg_values, list(), data_definitions = data_definitions, post_process_function = post_process_function @@ -256,7 +256,7 @@ test_that('Bad normalization methods are detected', { model, ddps, independent_arg_names, - initial_independent_arg_values, + initial_ind_arg_values, quantity_weights, normalization_method = 'bad_normalization_method', data_definitions = data_definitions, @@ -272,7 +272,7 @@ test_that('Bad return values are detected', { model, ddps, independent_arg_names, - initial_independent_arg_values, + initial_ind_arg_values, quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function, From df77e58fc94d58e00bfa16c822a843146c61d43f Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Sat, 3 May 2025 12:16:47 -0500 Subject: [PATCH 10/54] Be more efficient with normalization --- R/objective_function.R | 48 +++++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index 7ba6b1f..2a39258 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -373,6 +373,32 @@ add_time_indices <- function( long_form_data } +# Helping function for getting normalization factors +add_norm <- function(long_form_data, normalization_method) { + for (i in seq_along(long_form_data)) { + data_table <- long_form_data[[i]] + + data_table[['norm']] <- sapply(seq_len(nrow(data_table)), function(j) { + qname <- data_table[j, 'quantity_name'] + + qname_subset <- + data_table[data_table[['quantity_name']] == qname, ] + + if (tolower(normalization_method) == 'mean_max') { + npts <- nrow(qname_subset) + qmax <- max(qname_subset[['quantity_value']]) + npts * qmax^2 + } else { + stop('Unsupported normalization_method: ', normalization_method) + } + }) + + long_form_data[[i]] <- data_table + } + + long_form_data +} + # Helping function that processes and checks the quantity weights process_quantity_weights <- function(quantity_weights, long_form_data) { # First make sure that weights have been provided for all measured @@ -406,23 +432,12 @@ process_quantity_weights <- function(quantity_weights, long_form_data) { }) } -# Helping function that calculates one normalization factor -one_norm <- function(long_form_data_table, qname, normalization_method) { - if (tolower(normalization_method) == 'mean_max') { - npts <- sum(long_form_data_table[, 'quantity_name'] == qname) - qmax <- max(long_form_data_table[long_form_data_table[, 'quantity_name'] == qname, 'quantity_value']) - npts * qmax^2 - } else { - stop('Unsupported normalization_method: ', normalization_method) - } -} - # Helping function that calculates one error one_error <- function(observed, predicted, weight, normalization) { - weight_multiplier <- if (observed <= predicted) { - weight[1] + weight_multiplier <- if (predicted < observed) { + weight[1] # Underprediction } else { - weight[2] + weight[2] # Overprediction } (observed - predicted)^2 * weight_multiplier / normalization @@ -460,7 +475,7 @@ error_from_res <- function( indx <- long_form_data_table[i, 'time_index'] pred <- simulation_result[indx, qname] wt <- quantity_weights[[qname]] - norm <- one_norm(long_form_data_table, qname, normalization_method) + norm <- long_form_data_table[i, 'norm'] one_error(obs, pred, wt, norm) }) @@ -553,6 +568,9 @@ objective_function <- function( long_form_data ) + # Add normalization factors + long_form_data <- add_norm(long_form_data, normalization_method) + # Process the quantity weights processed_weights <- process_quantity_weights(quantity_weights, long_form_data) From edfb96562da412f7f5b3fa8c4fa57bc038d77611 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Sat, 3 May 2025 13:05:19 -0500 Subject: [PATCH 11/54] Be more efficient with initial model runner evaluations --- R/objective_function.R | 91 ++++++++++++------------ tests/testthat/test-objective_function.R | 26 ++++++- 2 files changed, 72 insertions(+), 45 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index 2a39258..7b1a155 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -157,14 +157,7 @@ get_model_runner <- function( # Helping function for checking the model runners; will throw an error if a # problem is detected, and will otherwise be silent with no return value. -check_runners <- function( - data_driver_pairs, - model_runners, - initial_ind_arg_values, - full_data_definitions -) -{ - # First check for runners that could not be created +check_runners <- function(model_runners) { bad_runners <- sapply(model_runners, is.character) if (any(bad_runners)) { @@ -179,23 +172,24 @@ check_runners <- function( stop(msg) } - # Now check for runners that cannot be evaluated - runner_eval_msg <- sapply(model_runners, function(runner) { - runner_result <- tryCatch( - runner(as.numeric(initial_ind_arg_values)), - error = function(e) {as.character(e)} - ) + return(invisible(NULL)) +} - if (is.character(runner_result)) { - runner_result - } else { - '' - } - }) +# Helping function for checking the initial model runner results; will throw an +# error if a problem is detected, and will otherwise be silent with no return +# value. +check_runner_results <- function( + initial_runner_res, + full_data_definitions, + data_driver_pairs +) +{ + # Check for runners that could not be evaluated + bad_res <- sapply(initial_runner_res, is.character) - if (any(runner_eval_msg != '')) { - bad_runner_names <- names(model_runners)[runner_eval_msg != ''] - bad_runner_msg <- runner_eval_msg[runner_eval_msg != ''] + if (any(bad_res)) { + bad_runner_names <- names(initial_runner_res)[bad_res] + bad_runner_msg <- initial_runner_res[bad_res] msg <- paste0( 'The model could not be run with the following drivers:\n', @@ -205,15 +199,10 @@ check_runners <- function( stop(msg) } - # Run each runner - runner_results <- lapply(model_runners, function(runner) { - runner(as.numeric(initial_ind_arg_values)) - }) - - # Now make sure each runner produces the necessary columns in its output + # Make sure each runner produces the necessary columns in its output expected_columns <- as.character(full_data_definitions) - missing_columns <- lapply(runner_results, function(res) { + missing_columns <- lapply(initial_runner_res, function(res) { expected_columns[!expected_columns %in% colnames(res)] }) @@ -229,7 +218,7 @@ check_runners <- function( msg <- append( msg, paste0( - names(model_runners)[i], ': ', + names(initial_runner_res)[i], ': ', paste(missing_columns[[i]], collapse = ', ') ) ) @@ -240,8 +229,8 @@ check_runners <- function( } # Make sure the output from each runner includes the observed times - times_out_of_range <- lapply(seq_along(runner_results), function(i) { - res <- runner_results[[i]] + times_out_of_range <- lapply(seq_along(initial_runner_res), function(i) { + res <- initial_runner_res[[i]] min_time <- min(res[['time']]) max_time <- max(res[['time']]) @@ -267,7 +256,7 @@ check_runners <- function( msg <- append( msg, paste0( - names(model_runners)[i], ': ', + names(initial_runner_res)[i], ': ', paste(times_out_of_range[[i]], collapse = ', ') ) ) @@ -280,6 +269,16 @@ check_runners <- function( return(invisible(NULL)) } +# Helping function for running each runner with the initial argument values +get_initial_runner_res <- function(model_runners, initial_ind_arg_values) { + lapply(model_runners, function(runner) { + tryCatch( + runner(as.numeric(initial_ind_arg_values)), + error = function(e) {as.character(e)} + ) + }) +} + # Helping function for checking the objective function; will throw an error if a # problem is detected, and will otherwise be silent with no return value. check_obj_fun <- function(obj_fun, initial_ind_arg_values) { @@ -350,15 +349,13 @@ get_long_form_data <- function(data_driver_pairs, full_data_definitions) { # Helping function for getting time indices add_time_indices <- function( - model_runners, + initial_runner_res, initial_ind_arg_values, long_form_data ) { for (i in seq_along(long_form_data)) { - runner <- model_runners[[i]] - res <- runner(as.numeric(initial_ind_arg_values)) - + res <- initial_runner_res[[i]] dataf <- long_form_data[[i]] indices <- sapply(dataf[, 'time'], function(x) { @@ -550,11 +547,17 @@ objective_function <- function( get_data_definition_list(data_driver_pairs, data_definitions) # Check the model runners - check_runners( - data_driver_pairs, - model_runners, - initial_ind_arg_values, - full_data_definitions + check_runners(model_runners) + + # Get initial model runner results + initial_runner_res <- + get_initial_runner_res(model_runners, initial_ind_arg_values) + + # Check the initial model runner results + check_runner_results( + initial_runner_res, + full_data_definitions, + data_driver_pairs ) # Get the long-form data @@ -563,7 +566,7 @@ objective_function <- function( # Find indices corresponding to the measured time points long_form_data <- add_time_indices( - model_runners, + initial_runner_res, initial_ind_arg_values, long_form_data ) diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index bd513dd..a7e83b6 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -1,3 +1,6 @@ +# Specify test settings +TOLERANCE <- 1e-6 + # Specify key inputs to use for these tests model <- BioCro::soybean model$ode_solver <- BioCro::default_ode_solvers[['homemade_euler']] @@ -40,7 +43,8 @@ quantity_weights <- list( ) # Run tests -test_that('An objective function can be created', { +test_that('Objective functions can be created and behave as expected', { + # Two data-driver pairs, no dependent arguments obj_fun <- expect_silent( objective_function( model, @@ -53,6 +57,13 @@ test_that('An objective function can be created', { ) ) + expect_equal( + obj_fun(as.numeric(initial_ind_arg_values)), + 0.06316674, + tolerance = TOLERANCE + ) + + # One data-driver pair, no dependent arguments obj_fun <- expect_silent( objective_function( model, @@ -65,6 +76,13 @@ test_that('An objective function can be created', { ) ) + expect_equal( + obj_fun(as.numeric(initial_ind_arg_values)), + 0.02607073, + tolerance = TOLERANCE + ) + + # Two data-driver pairs, with dependent arguments obj_fun <- expect_silent( objective_function( model, @@ -77,6 +95,12 @@ test_that('An objective function can be created', { post_process_function = post_process_function ) ) + + expect_equal( + obj_fun(as.numeric(initial_ind_arg_values)), + 0.06686873, + tolerance = TOLERANCE + ) }) test_that('Bad definitions are detected', { From 40cfead108262a2c2f60064dcdc54bc4ec00acb8 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Sat, 3 May 2025 13:44:18 -0500 Subject: [PATCH 12/54] Add regularization --- R/objective_function.R | 31 +++++++++++++++--- tests/testthat/test-objective_function.R | 41 +++++++++++++----------- 2 files changed, 49 insertions(+), 23 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index 7b1a155..36e130d 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -481,16 +481,35 @@ error_from_res <- function( penalty + sum(errors) } +# Helping function for calculating a regularization penalty term +regularization_penalty <- function( + ind_arg_vals, + regularization_method, + regularization_lambda +) +{ + if (toupper(regularization_method) == 'NONE') { + 0.0 + } else if (toupper(regularization_method) == 'LASSO' || toupper(regularization_method) == 'L1') { + regularization_lambda * sum(abs(ind_arg_vals)) + } else if (toupper(regularization_method) == 'RIDGE' || toupper(regularization_method) == 'L2') { + regularization_lambda * sum(ind_arg_vals^2) + } else { + stop('Unsupported regularization method: ', regularization_method) + } +} + # Helping function that forms the overall objective function get_obj_fun <- function( model_runners, long_form_data, processed_weights, normalization_method, - extra_penalty_function + extra_penalty_function, + regularization_method ) { - function(x) { + function(x, lambda = 0) { errors <- sapply(seq_along(model_runners), function(i) { runner <- model_runners[[i]] res <- runner(x) @@ -504,6 +523,8 @@ get_obj_fun <- function( ) }) + reg_penalty <- regularization_penalty(x, regularization_method, lambda) + sum(errors) } } @@ -518,7 +539,8 @@ objective_function <- function( normalization_method = 'mean_max', dependent_arg_function = NULL, post_process_function = NULL, - extra_penalty_function = NULL + extra_penalty_function = NULL, + regularization_method = 'none' ) { # Check the data-driver pairs @@ -584,7 +606,8 @@ objective_function <- function( long_form_data, processed_weights, normalization_method, - extra_penalty_function + extra_penalty_function, + regularization_method ) # Check the objective function diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index a7e83b6..73a480c 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -1,6 +1,3 @@ -# Specify test settings -TOLERANCE <- 1e-6 - # Specify key inputs to use for these tests model <- BioCro::soybean model$ode_solver <- BioCro::default_ode_solvers[['homemade_euler']] @@ -57,10 +54,12 @@ test_that('Objective functions can be created and behave as expected', { ) ) - expect_equal( - obj_fun(as.numeric(initial_ind_arg_values)), - 0.06316674, - tolerance = TOLERANCE + expect_silent( + obj_fun(as.numeric(initial_ind_arg_values)) + ) + + expect_silent( + obj_fun(as.numeric(initial_ind_arg_values), 0.5) ) # One data-driver pair, no dependent arguments @@ -76,12 +75,6 @@ test_that('Objective functions can be created and behave as expected', { ) ) - expect_equal( - obj_fun(as.numeric(initial_ind_arg_values)), - 0.02607073, - tolerance = TOLERANCE - ) - # Two data-driver pairs, with dependent arguments obj_fun <- expect_silent( objective_function( @@ -95,12 +88,6 @@ test_that('Objective functions can be created and behave as expected', { post_process_function = post_process_function ) ) - - expect_equal( - obj_fun(as.numeric(initial_ind_arg_values)), - 0.06686873, - tolerance = TOLERANCE - ) }) test_that('Bad definitions are detected', { @@ -305,3 +292,19 @@ test_that('Bad return values are detected', { 'The objective function did not return a finite value when using the initial argument values; instead, it returned: NA' ) }) + +test_that('Bad regularization methods are detected', { + expect_error( + objective_function( + model, + ddps, + independent_arg_names, + initial_ind_arg_values, + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + regularization_method = 'bad_regularization_method' + ), + 'Unsupported regularization method: bad_regularization_method' + ) +}) From a0148373a3d44b49a7e58bfb436d868795b8361f Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Sat, 3 May 2025 14:58:23 -0500 Subject: [PATCH 13/54] Reorganize file structure --- R/objective_function.R | 532 +--------------------------- R/objective_function_helpers.R | 303 ++++++++++++++++ R/objective_function_input_checks.R | 226 ++++++++++++ 3 files changed, 531 insertions(+), 530 deletions(-) create mode 100644 R/objective_function_helpers.R create mode 100644 R/objective_function_input_checks.R diff --git a/R/objective_function.R b/R/objective_function.R index 36e130d..1fc3380 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -1,533 +1,5 @@ -# Helping function for checking the data-driver pairs; will throw an error if -# a problem is detected, and will otherwise be silent with no return value. -check_data_driver_pairs <- function(base_model_definition, data_driver_pairs) { - if (is.null(names(data_driver_pairs))) { - stop('`data_driver_pairs` must have names') - } - - has_elements <- sapply(data_driver_pairs, function(x) { - 'drivers' %in% names(x) && 'data' %in% names(x) - }) - - if (any(!has_elements)) { - missing_elements <- names(data_driver_pairs)[!has_elements] - - msg <- paste( - 'The following data-driver pairs are missing a `drivers` element,', - 'a `data` element, or both:', - paste(missing_elements, collapse = ', ') - ) - - stop(msg) - } - - has_time <- sapply(data_driver_pairs, function(x) { - 'time' %in% colnames(x[['data']]) - }) - - if (any(!has_time)) { - missing_time <- names(data_driver_pairs)[!has_time] - - msg <- paste( - 'The following data-driver pairs are missing a `time` column', - 'in their `data` element:', - paste(missing_time, collapse = ', ') - ) - - stop(msg) - } - - valid_definitions <- sapply(data_driver_pairs, function(ddp) { - BioCro::validate_dynamical_system_inputs( - base_model_definition[['initial_values']], - base_model_definition[['parameters']], - ddp[['drivers']], - base_model_definition[['direct_modules']], - base_model_definition[['differential_modules']], - verbose = FALSE - ) - }) - - if (any(!valid_definitions)) { - invalid_ddp <- names(data_driver_pairs)[!valid_definitions] - - msg <- paste( - 'The following drivers did not form a valid dynamical system:', - paste(invalid_ddp, collapse = ', ') - ) - - stop(msg) - } - - return(invisible(NULL)) -} - -# Helping function for checking the independent argument names and initial -# values -check_independent_arguments <- function( - independent_arg_names, - initial_ind_arg_values -) -{ - if (length(independent_arg_names) != length(initial_ind_arg_values)) { - stop( - '`independent_arg_names` and `initial_ind_arg_values` ', - 'must have the same length' - ) - } - - if (is.null(names(initial_ind_arg_values))) { - stop('`initial_ind_arg_values` must have names') - } - - if (any(!names(initial_ind_arg_values) %in% independent_arg_names)) { - bad_arg <- !names(initial_ind_arg_values) %in% independent_arg_names - - msg <- paste( - 'The following arguments are included in `initial_ind_arg_values`', - 'but not `independent_arg_names`:', - paste(names(initial_ind_arg_values)[bad_arg], collapse = ', ') - ) - - stop(msg) - } - - return(invisible(NULL)) -} - -# Helping function for getting a model runner; if the runner cannot be created, -# an error message will be returned instead -get_model_runner <- function( - base_model_definition, - independent_arg_names, - initial_ind_arg_values, - dependent_arg_function, - post_process_function, - ddp -) -{ - # Get the full list of arg_names - arg_names <- if (is.null(dependent_arg_function)) { - independent_arg_names - } else { - dependent_arg_values <- - dependent_arg_function(initial_ind_arg_values) - - c(independent_arg_names, names(dependent_arg_values)) - } - - # Build the runner - tryCatch({ - partial_func <- BioCro::partial_run_biocro( - base_model_definition[['initial_values']], - base_model_definition[['parameters']], - ddp[['drivers']], - base_model_definition[['direct_modules']], - base_model_definition[['differential_modules']], - base_model_definition[['ode_solver']], - arg_names - ) - - function(x) { - if (!is.numeric(x)) { - stop('`x` must be numeric') - } - - x_for_partial <- if (is.null(dependent_arg_function)) { - x - } else { - x_for_dep_arg_func <- - stats::setNames(x, independent_arg_names) - - c(x, as.numeric(dependent_arg_function(x_for_dep_arg_func))) - } - - initial_res <- partial_func(x_for_partial) - - if (is.null(post_process_function)) { - initial_res - } else { - post_process_function(initial_res) - } - } - }, - error = function(e) {as.character(e)} - ) -} - -# Helping function for checking the model runners; will throw an error if a -# problem is detected, and will otherwise be silent with no return value. -check_runners <- function(model_runners) { - bad_runners <- sapply(model_runners, is.character) - - if (any(bad_runners)) { - bad_runner_names <- names(model_runners)[bad_runners] - bad_runner_msg <- as.character(model_runners[bad_runners]) - - msg <- paste0( - 'Model runners could not be created for the following drivers:\n', - paste0(bad_runner_names, ': ', bad_runner_msg, collapse = '') - ) - - stop(msg) - } - - return(invisible(NULL)) -} - -# Helping function for checking the initial model runner results; will throw an -# error if a problem is detected, and will otherwise be silent with no return -# value. -check_runner_results <- function( - initial_runner_res, - full_data_definitions, - data_driver_pairs -) -{ - # Check for runners that could not be evaluated - bad_res <- sapply(initial_runner_res, is.character) - - if (any(bad_res)) { - bad_runner_names <- names(initial_runner_res)[bad_res] - bad_runner_msg <- initial_runner_res[bad_res] - - msg <- paste0( - 'The model could not be run with the following drivers:\n', - paste0(bad_runner_names, ': ', bad_runner_msg, collapse = '') - ) - - stop(msg) - } - - # Make sure each runner produces the necessary columns in its output - expected_columns <- as.character(full_data_definitions) - - missing_columns <- lapply(initial_runner_res, function(res) { - expected_columns[!expected_columns %in% colnames(res)] - }) - - bad_outputs <- sapply(missing_columns, function(x) { - length(x) > 0 - }) - - if (any(bad_outputs)) { - msg <- 'Some data columns were missing from runner outputs:' - - for (i in seq_along(bad_outputs)) { - if (bad_outputs[i]) { - msg <- append( - msg, - paste0( - names(initial_runner_res)[i], ': ', - paste(missing_columns[[i]], collapse = ', ') - ) - ) - } - } - - stop(paste(msg, collapse = '\n')) - } - - # Make sure the output from each runner includes the observed times - times_out_of_range <- lapply(seq_along(initial_runner_res), function(i) { - res <- initial_runner_res[[i]] - - min_time <- min(res[['time']]) - max_time <- max(res[['time']]) - - data_times <- data_driver_pairs[[i]][['data']][['time']] - - oor <- sapply(data_times, function(datat) { - datat < min_time || datat > max_time - }) - - data_times[oor] - }) - - bad_times <- sapply(times_out_of_range, function(x) { - length(x) > 0 - }) - - if (any(bad_times)) { - msg <- 'Some observed times were missing from runner outputs:' - - for (i in seq_along(bad_times)) { - if (bad_times[i]) { - msg <- append( - msg, - paste0( - names(initial_runner_res)[i], ': ', - paste(times_out_of_range[[i]], collapse = ', ') - ) - ) - } - } - - stop(paste(msg, collapse = '\n')) - } - - return(invisible(NULL)) -} - -# Helping function for running each runner with the initial argument values -get_initial_runner_res <- function(model_runners, initial_ind_arg_values) { - lapply(model_runners, function(runner) { - tryCatch( - runner(as.numeric(initial_ind_arg_values)), - error = function(e) {as.character(e)} - ) - }) -} - -# Helping function for checking the objective function; will throw an error if a -# problem is detected, and will otherwise be silent with no return value. -check_obj_fun <- function(obj_fun, initial_ind_arg_values) { - initial_error <- obj_fun(as.numeric(initial_ind_arg_values)) - - if (!is.finite(initial_error)) { - stop( - 'The objective function did not return a finite value when using ', - 'the initial argument values; instead, it returned: ', - initial_error - ) - } - - return(invisible(NULL)) -} - -# Helping function for getting a "data definition list," which specifies the -# names of the `data` columns as they appear in the simulation output -get_data_definition_list <- function(data_driver_pairs, user_data_definitions) -{ - # First get all the column names found in the observed data - all_data_colnames <- - lapply(data_driver_pairs, function(x) {colnames(x[['data']])}) - - all_data_colnames <- unlist(all_data_colnames) - - all_data_colnames <- all_data_colnames[!duplicated(all_data_colnames)] - - # Remove the `time` column - all_data_colnames <- all_data_colnames[all_data_colnames != 'time'] - - # Build the data definition list - data_definitions <- lapply(all_data_colnames, function(cn) { - if (cn %in% names(user_data_definitions)) { - user_data_definitions[[cn]] - } else { - cn - } - }) - - names(data_definitions) <- all_data_colnames - - data_definitions -} - -# Helping function for converting each data table to a "long form." -get_long_form_data <- function(data_driver_pairs, full_data_definitions) { - lapply(data_driver_pairs, function(ddp) { - short_form_data <- ddp[['data']] - - data_column_names <- colnames(short_form_data) - data_column_names <- data_column_names[data_column_names != 'time'] - - long_form_data_list <- lapply(data_column_names, function(cn) { - data.frame( - time = short_form_data[, 'time'], - quantity_name = full_data_definitions[[cn]], - quantity_value = short_form_data[, cn], - stringsAsFactors = FALSE - ) - }) - - long_form_data <- do.call(rbind, long_form_data_list) - - long_form_data[!is.na(long_form_data[['quantity_value']]), ] - }) -} - -# Helping function for getting time indices -add_time_indices <- function( - initial_runner_res, - initial_ind_arg_values, - long_form_data -) -{ - for (i in seq_along(long_form_data)) { - res <- initial_runner_res[[i]] - dataf <- long_form_data[[i]] - - indices <- sapply(dataf[, 'time'], function(x) { - tdiff <- abs(res[, 'time'] - x) - which(tdiff == min(tdiff)) - }) - - long_form_data[[i]][, 'time_index'] <- indices - long_form_data[[i]][, 'expected_npts'] <- nrow(res) - } - - long_form_data -} - -# Helping function for getting normalization factors -add_norm <- function(long_form_data, normalization_method) { - for (i in seq_along(long_form_data)) { - data_table <- long_form_data[[i]] - - data_table[['norm']] <- sapply(seq_len(nrow(data_table)), function(j) { - qname <- data_table[j, 'quantity_name'] - - qname_subset <- - data_table[data_table[['quantity_name']] == qname, ] - - if (tolower(normalization_method) == 'mean_max') { - npts <- nrow(qname_subset) - qmax <- max(qname_subset[['quantity_value']]) - npts * qmax^2 - } else { - stop('Unsupported normalization_method: ', normalization_method) - } - }) - - long_form_data[[i]] <- data_table - } - - long_form_data -} - -# Helping function that processes and checks the quantity weights -process_quantity_weights <- function(quantity_weights, long_form_data) { - # First make sure that weights have been provided for all measured - # quantities - all_data_colnames <- lapply(long_form_data, function(x) { - unique(x[, 'quantity_name']) - }) - - all_data_colnames <- unlist(all_data_colnames) - - all_data_colnames <- unique(all_data_colnames) - - weight_was_supplied <- sapply(all_data_colnames, function(cn) { - cn %in% names(quantity_weights) - }) - - if (any(!weight_was_supplied)) { - missing_weights <- all_data_colnames[!weight_was_supplied] - - msg <- paste( - 'Weights were not supplied for the following measured quantities:', - paste(missing_weights, collapse = ', ') - ) - - stop(msg) - } - - # Now make sure all the weights have length 2 - lapply(quantity_weights, function(wt) { - rep_len(wt, 2) - }) -} - -# Helping function that calculates one error -one_error <- function(observed, predicted, weight, normalization) { - weight_multiplier <- if (predicted < observed) { - weight[1] # Underprediction - } else { - weight[2] # Overprediction - } - - (observed - predicted)^2 * weight_multiplier / normalization -} - -# Helping function that calculates an error value from a simulation result -error_from_res <- function( - simulation_result, - long_form_data_table, - quantity_weights, - normalization_method, - extra_penalty_function -) -{ - # If the simulation did not finish, return a very high value - expected_npts <- long_form_data_table[1, 'expected_npts'] - - if (nrow(simulation_result) < expected_npts) { - return(1e6) - } - - # Calculate any user-specified penalties - penalty <- if (is.null(extra_penalty_function)) { - 0.0 - } else { - extra_penalty_function(simulation_result) - } - - # Calculate the error terms - n_obs <- nrow(long_form_data_table) - - errors <- sapply(seq_len(n_obs), function(i) { - qname <- as.character(long_form_data_table[i, 'quantity_name']) - obs <- long_form_data_table[i, 'quantity_value'] - indx <- long_form_data_table[i, 'time_index'] - pred <- simulation_result[indx, qname] - wt <- quantity_weights[[qname]] - norm <- long_form_data_table[i, 'norm'] - - one_error(obs, pred, wt, norm) - }) - - # Return the sum of the penalty and error terms - penalty + sum(errors) -} - -# Helping function for calculating a regularization penalty term -regularization_penalty <- function( - ind_arg_vals, - regularization_method, - regularization_lambda -) -{ - if (toupper(regularization_method) == 'NONE') { - 0.0 - } else if (toupper(regularization_method) == 'LASSO' || toupper(regularization_method) == 'L1') { - regularization_lambda * sum(abs(ind_arg_vals)) - } else if (toupper(regularization_method) == 'RIDGE' || toupper(regularization_method) == 'L2') { - regularization_lambda * sum(ind_arg_vals^2) - } else { - stop('Unsupported regularization method: ', regularization_method) - } -} - -# Helping function that forms the overall objective function -get_obj_fun <- function( - model_runners, - long_form_data, - processed_weights, - normalization_method, - extra_penalty_function, - regularization_method -) -{ - function(x, lambda = 0) { - errors <- sapply(seq_along(model_runners), function(i) { - runner <- model_runners[[i]] - res <- runner(x) - - error_from_res( - res, - long_form_data[[i]], - processed_weights, - normalization_method, - extra_penalty_function - ) - }) - - reg_penalty <- regularization_penalty(x, regularization_method, lambda) - - sum(errors) - } -} +# Here we use several functions that are defined in +# `objective_function_input_checks.R` and `objective_function_helpers.R` objective_function <- function( base_model_definition, diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R new file mode 100644 index 0000000..a787d64 --- /dev/null +++ b/R/objective_function_helpers.R @@ -0,0 +1,303 @@ +# Helping function for getting a model runner; if the runner cannot be created, +# an error message will be returned instead +get_model_runner <- function( + base_model_definition, + independent_arg_names, + initial_ind_arg_values, + dependent_arg_function, + post_process_function, + ddp +) +{ + # Get the full list of arg_names + arg_names <- if (is.null(dependent_arg_function)) { + independent_arg_names + } else { + dependent_arg_values <- + dependent_arg_function(initial_ind_arg_values) + + c(independent_arg_names, names(dependent_arg_values)) + } + + # Build the runner + tryCatch({ + partial_func <- BioCro::partial_run_biocro( + base_model_definition[['initial_values']], + base_model_definition[['parameters']], + ddp[['drivers']], + base_model_definition[['direct_modules']], + base_model_definition[['differential_modules']], + base_model_definition[['ode_solver']], + arg_names + ) + + function(x) { + if (!is.numeric(x)) { + stop('`x` must be numeric') + } + + x_for_partial <- if (is.null(dependent_arg_function)) { + x + } else { + x_for_dep_arg_func <- + stats::setNames(x, independent_arg_names) + + c(x, as.numeric(dependent_arg_function(x_for_dep_arg_func))) + } + + initial_res <- partial_func(x_for_partial) + + if (is.null(post_process_function)) { + initial_res + } else { + post_process_function(initial_res) + } + } + }, + error = function(e) {as.character(e)} + ) +} + +# Helping function for running each runner with the initial argument values +get_initial_runner_res <- function(model_runners, initial_ind_arg_values) { + lapply(model_runners, function(runner) { + tryCatch( + runner(as.numeric(initial_ind_arg_values)), + error = function(e) {as.character(e)} + ) + }) +} + +# Helping function for getting a "data definition list," which specifies the +# names of the `data` columns as they appear in the simulation output +get_data_definition_list <- function(data_driver_pairs, user_data_definitions) +{ + # First get all the column names found in the observed data + all_data_colnames <- + lapply(data_driver_pairs, function(x) {colnames(x[['data']])}) + + all_data_colnames <- unlist(all_data_colnames) + + all_data_colnames <- all_data_colnames[!duplicated(all_data_colnames)] + + # Remove the `time` column + all_data_colnames <- all_data_colnames[all_data_colnames != 'time'] + + # Build the data definition list + data_definitions <- lapply(all_data_colnames, function(cn) { + if (cn %in% names(user_data_definitions)) { + user_data_definitions[[cn]] + } else { + cn + } + }) + + names(data_definitions) <- all_data_colnames + + data_definitions +} + +# Helping function for converting each data table to a "long form." +get_long_form_data <- function(data_driver_pairs, full_data_definitions) { + lapply(data_driver_pairs, function(ddp) { + short_form_data <- ddp[['data']] + + data_column_names <- colnames(short_form_data) + data_column_names <- data_column_names[data_column_names != 'time'] + + long_form_data_list <- lapply(data_column_names, function(cn) { + data.frame( + time = short_form_data[, 'time'], + quantity_name = full_data_definitions[[cn]], + quantity_value = short_form_data[, cn], + stringsAsFactors = FALSE + ) + }) + + long_form_data <- do.call(rbind, long_form_data_list) + + long_form_data[!is.na(long_form_data[['quantity_value']]), ] + }) +} + +# Helping function for getting time indices +add_time_indices <- function( + initial_runner_res, + initial_ind_arg_values, + long_form_data +) +{ + for (i in seq_along(long_form_data)) { + res <- initial_runner_res[[i]] + dataf <- long_form_data[[i]] + + indices <- sapply(dataf[, 'time'], function(x) { + tdiff <- abs(res[, 'time'] - x) + which(tdiff == min(tdiff)) + }) + + long_form_data[[i]][, 'time_index'] <- indices + long_form_data[[i]][, 'expected_npts'] <- nrow(res) + } + + long_form_data +} + +# Helping function for getting normalization factors +add_norm <- function(long_form_data, normalization_method) { + for (i in seq_along(long_form_data)) { + data_table <- long_form_data[[i]] + + data_table[['norm']] <- sapply(seq_len(nrow(data_table)), function(j) { + qname <- data_table[j, 'quantity_name'] + + qname_subset <- + data_table[data_table[['quantity_name']] == qname, ] + + if (tolower(normalization_method) == 'mean_max') { + npts <- nrow(qname_subset) + qmax <- max(qname_subset[['quantity_value']]) + npts * qmax^2 + } else { + stop('Unsupported normalization_method: ', normalization_method) + } + }) + + long_form_data[[i]] <- data_table + } + + long_form_data +} + +# Helping function that processes and checks the quantity weights +process_quantity_weights <- function(quantity_weights, long_form_data) { + # First make sure that weights have been provided for all measured + # quantities + all_data_colnames <- lapply(long_form_data, function(x) { + unique(x[, 'quantity_name']) + }) + + all_data_colnames <- unlist(all_data_colnames) + + all_data_colnames <- unique(all_data_colnames) + + weight_was_supplied <- sapply(all_data_colnames, function(cn) { + cn %in% names(quantity_weights) + }) + + if (any(!weight_was_supplied)) { + missing_weights <- all_data_colnames[!weight_was_supplied] + + msg <- paste( + 'Weights were not supplied for the following measured quantities:', + paste(missing_weights, collapse = ', ') + ) + + stop(msg) + } + + # Now make sure all the weights have length 2 + lapply(quantity_weights, function(wt) { + rep_len(wt, 2) + }) +} + +# Helping function that calculates one error +one_error <- function(observed, predicted, weight, normalization) { + weight_multiplier <- if (predicted < observed) { + weight[1] # Underprediction + } else { + weight[2] # Overprediction + } + + (observed - predicted)^2 * weight_multiplier / normalization +} + +# Helping function that calculates an error value from a simulation result +error_from_res <- function( + simulation_result, + long_form_data_table, + quantity_weights, + normalization_method, + extra_penalty_function +) +{ + # If the simulation did not finish, return a very high value + expected_npts <- long_form_data_table[1, 'expected_npts'] + + if (nrow(simulation_result) < expected_npts) { + return(1e6) + } + + # Calculate any user-specified penalties + penalty <- if (is.null(extra_penalty_function)) { + 0.0 + } else { + extra_penalty_function(simulation_result) + } + + # Calculate the error terms + n_obs <- nrow(long_form_data_table) + + errors <- sapply(seq_len(n_obs), function(i) { + qname <- as.character(long_form_data_table[i, 'quantity_name']) + obs <- long_form_data_table[i, 'quantity_value'] + indx <- long_form_data_table[i, 'time_index'] + pred <- simulation_result[indx, qname] + wt <- quantity_weights[[qname]] + norm <- long_form_data_table[i, 'norm'] + + one_error(obs, pred, wt, norm) + }) + + # Return the sum of the penalty and error terms + penalty + sum(errors) +} + +# Helping function for calculating a regularization penalty term +regularization_penalty <- function( + ind_arg_vals, + regularization_method, + regularization_lambda +) +{ + if (toupper(regularization_method) == 'NONE') { + 0.0 + } else if (toupper(regularization_method) == 'LASSO' || toupper(regularization_method) == 'L1') { + regularization_lambda * sum(abs(ind_arg_vals)) + } else if (toupper(regularization_method) == 'RIDGE' || toupper(regularization_method) == 'L2') { + regularization_lambda * sum(ind_arg_vals^2) + } else { + stop('Unsupported regularization method: ', regularization_method) + } +} + +# Helping function that forms the overall objective function +get_obj_fun <- function( + model_runners, + long_form_data, + processed_weights, + normalization_method, + extra_penalty_function, + regularization_method +) +{ + function(x, lambda = 0) { + errors <- sapply(seq_along(model_runners), function(i) { + runner <- model_runners[[i]] + res <- runner(x) + + error_from_res( + res, + long_form_data[[i]], + processed_weights, + normalization_method, + extra_penalty_function + ) + }) + + reg_penalty <- regularization_penalty(x, regularization_method, lambda) + + sum(errors) + } +} diff --git a/R/objective_function_input_checks.R b/R/objective_function_input_checks.R new file mode 100644 index 0000000..89dcda6 --- /dev/null +++ b/R/objective_function_input_checks.R @@ -0,0 +1,226 @@ +# Helping function for checking the data-driver pairs; will throw an error if +# a problem is detected, and will otherwise be silent with no return value. +check_data_driver_pairs <- function(base_model_definition, data_driver_pairs) { + if (is.null(names(data_driver_pairs))) { + stop('`data_driver_pairs` must have names') + } + + has_elements <- sapply(data_driver_pairs, function(x) { + 'drivers' %in% names(x) && 'data' %in% names(x) + }) + + if (any(!has_elements)) { + missing_elements <- names(data_driver_pairs)[!has_elements] + + msg <- paste( + 'The following data-driver pairs are missing a `drivers` element,', + 'a `data` element, or both:', + paste(missing_elements, collapse = ', ') + ) + + stop(msg) + } + + has_time <- sapply(data_driver_pairs, function(x) { + 'time' %in% colnames(x[['data']]) + }) + + if (any(!has_time)) { + missing_time <- names(data_driver_pairs)[!has_time] + + msg <- paste( + 'The following data-driver pairs are missing a `time` column', + 'in their `data` element:', + paste(missing_time, collapse = ', ') + ) + + stop(msg) + } + + valid_definitions <- sapply(data_driver_pairs, function(ddp) { + BioCro::validate_dynamical_system_inputs( + base_model_definition[['initial_values']], + base_model_definition[['parameters']], + ddp[['drivers']], + base_model_definition[['direct_modules']], + base_model_definition[['differential_modules']], + verbose = FALSE + ) + }) + + if (any(!valid_definitions)) { + invalid_ddp <- names(data_driver_pairs)[!valid_definitions] + + msg <- paste( + 'The following drivers did not form a valid dynamical system:', + paste(invalid_ddp, collapse = ', ') + ) + + stop(msg) + } + + return(invisible(NULL)) +} + +# Helping function for checking the independent argument names and initial +# values +check_independent_arguments <- function( + independent_arg_names, + initial_ind_arg_values +) +{ + if (length(independent_arg_names) != length(initial_ind_arg_values)) { + stop( + '`independent_arg_names` and `initial_ind_arg_values` ', + 'must have the same length' + ) + } + + if (is.null(names(initial_ind_arg_values))) { + stop('`initial_ind_arg_values` must have names') + } + + if (any(!names(initial_ind_arg_values) %in% independent_arg_names)) { + bad_arg <- !names(initial_ind_arg_values) %in% independent_arg_names + + msg <- paste( + 'The following arguments are included in `initial_ind_arg_values`', + 'but not `independent_arg_names`:', + paste(names(initial_ind_arg_values)[bad_arg], collapse = ', ') + ) + + stop(msg) + } + + return(invisible(NULL)) +} + +# Helping function for checking the model runners; will throw an error if a +# problem is detected, and will otherwise be silent with no return value. +check_runners <- function(model_runners) { + bad_runners <- sapply(model_runners, is.character) + + if (any(bad_runners)) { + bad_runner_names <- names(model_runners)[bad_runners] + bad_runner_msg <- as.character(model_runners[bad_runners]) + + msg <- paste0( + 'Model runners could not be created for the following drivers:\n', + paste0(bad_runner_names, ': ', bad_runner_msg, collapse = '') + ) + + stop(msg) + } + + return(invisible(NULL)) +} + +# Helping function for checking the initial model runner results; will throw an +# error if a problem is detected, and will otherwise be silent with no return +# value. +check_runner_results <- function( + initial_runner_res, + full_data_definitions, + data_driver_pairs +) +{ + # Check for runners that could not be evaluated + bad_res <- sapply(initial_runner_res, is.character) + + if (any(bad_res)) { + bad_runner_names <- names(initial_runner_res)[bad_res] + bad_runner_msg <- initial_runner_res[bad_res] + + msg <- paste0( + 'The model could not be run with the following drivers:\n', + paste0(bad_runner_names, ': ', bad_runner_msg, collapse = '') + ) + + stop(msg) + } + + # Make sure each runner produces the necessary columns in its output + expected_columns <- as.character(full_data_definitions) + + missing_columns <- lapply(initial_runner_res, function(res) { + expected_columns[!expected_columns %in% colnames(res)] + }) + + bad_outputs <- sapply(missing_columns, function(x) { + length(x) > 0 + }) + + if (any(bad_outputs)) { + msg <- 'Some data columns were missing from runner outputs:' + + for (i in seq_along(bad_outputs)) { + if (bad_outputs[i]) { + msg <- append( + msg, + paste0( + names(initial_runner_res)[i], ': ', + paste(missing_columns[[i]], collapse = ', ') + ) + ) + } + } + + stop(paste(msg, collapse = '\n')) + } + + # Make sure the output from each runner includes the observed times + times_out_of_range <- lapply(seq_along(initial_runner_res), function(i) { + res <- initial_runner_res[[i]] + + min_time <- min(res[['time']]) + max_time <- max(res[['time']]) + + data_times <- data_driver_pairs[[i]][['data']][['time']] + + oor <- sapply(data_times, function(datat) { + datat < min_time || datat > max_time + }) + + data_times[oor] + }) + + bad_times <- sapply(times_out_of_range, function(x) { + length(x) > 0 + }) + + if (any(bad_times)) { + msg <- 'Some observed times were missing from runner outputs:' + + for (i in seq_along(bad_times)) { + if (bad_times[i]) { + msg <- append( + msg, + paste0( + names(initial_runner_res)[i], ': ', + paste(times_out_of_range[[i]], collapse = ', ') + ) + ) + } + } + + stop(paste(msg, collapse = '\n')) + } + + return(invisible(NULL)) +} + +# Helping function for checking the objective function; will throw an error if a +# problem is detected, and will otherwise be silent with no return value. +check_obj_fun <- function(obj_fun, initial_ind_arg_values) { + initial_error <- obj_fun(as.numeric(initial_ind_arg_values)) + + if (!is.finite(initial_error)) { + stop( + 'The objective function did not return a finite value when using ', + 'the initial argument values; instead, it returned: ', + initial_error + ) + } + + return(invisible(NULL)) +} From 5e04df6e2909a478caf3bb2c1312417129b044fe Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Mon, 5 May 2025 20:27:32 -0500 Subject: [PATCH 14/54] Start documenting `objective_function` --- R/objective_function.R | 4 +- R/objective_function_helpers.R | 3 + R/objective_function_input_checks.R | 4 + man/objective_function.Rd | 218 ++++++++++++++++++++++++++++ 4 files changed, 227 insertions(+), 2 deletions(-) create mode 100644 man/objective_function.Rd diff --git a/R/objective_function.R b/R/objective_function.R index 1fc3380..31bbbf2 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -1,5 +1,5 @@ -# Here we use several functions that are defined in -# `objective_function_input_checks.R` and `objective_function_helpers.R` +## Here we use several functions that are defined in +## `objective_function_input_checks.R` and `objective_function_helpers.R` objective_function <- function( base_model_definition, diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R index a787d64..671ab2c 100644 --- a/R/objective_function_helpers.R +++ b/R/objective_function_helpers.R @@ -1,3 +1,6 @@ +## All the functions defined in this file are intended to perform key operations +## required by `objective_function`. + # Helping function for getting a model runner; if the runner cannot be created, # an error message will be returned instead get_model_runner <- function( diff --git a/R/objective_function_input_checks.R b/R/objective_function_input_checks.R index 89dcda6..a4da837 100644 --- a/R/objective_function_input_checks.R +++ b/R/objective_function_input_checks.R @@ -1,3 +1,7 @@ +## All the functions defined in this file are intended to check inputs for +## certain errors. Each function will throw an error if a problem is detected, +## and will otherwise be silent with no return value. + # Helping function for checking the data-driver pairs; will throw an error if # a problem is detected, and will otherwise be silent with no return value. check_data_driver_pairs <- function(base_model_definition, data_driver_pairs) { diff --git a/man/objective_function.Rd b/man/objective_function.Rd new file mode 100644 index 0000000..ac5f421 --- /dev/null +++ b/man/objective_function.Rd @@ -0,0 +1,218 @@ +\name{objective_function} + +\alias{objective_function} + +\title{Generate an objective function for BioCro model validation} + +\description{ + Given a base model definition, drivers to run the model, observed values of + model outputs, and the names of model arguments to vary, + \code{objective_function} creates an objective function that can be used to + parameterize the model. + + The objective function itself is based on a weighted least-squares error + metric, with optional user-defined penalty terms, and an optional + regularization penalty term. + + It is possible to define a multi-year or multi-location objective function by + pairing particular sets of drivers with corresponding sets of observed model + outputs. + + It is also possible to include "dependent" model arguments, whose values are + determined from the "independent" model arguments that are varied during the + parameterization procedure. +} + +\usage{ + objective_function( + base_model_definition, + data_driver_pairs, + independent_arg_names, + initial_ind_arg_values, + quantity_weights, + data_definitions = list(), + normalization_method = 'mean_max', + dependent_arg_function = NULL, + post_process_function = NULL, + extra_penalty_function = NULL, + regularization_method = 'none' + ) +} + +\arguments{ + + \item{base_model_definition{ + A list meeting the requirements for BioCro + \code{\link[BioCro]{crop_model_definitions}}. + } + + \item{data_driver_pairs{ + A list of named elements, where each element is a "data-driver pair." A + data-driver pair is a list with two elements, \code{data} and + \code{drivers}. The \code{data} element must be a data frame with one column + named \code{time}, whose values follow BioCro's definition of + \code{\link[BioCro]{time}}; the other columns should represent observed + values of model outputs. The \code{drivers} element must be a data frame + that can be passed to \code{\link[BioCro]{run_biocro}} as its \code{drivers} + input argument. + } + + \item{independent_arg_names{ + A vector or list of the names of independent model quantities to be + varied; will be passed to \code{\link[BioCro]{partial_run_biocro}}. + } + + \item{initial_ind_arg_values{ + A list of named numeric values, where the name of each element should be one + of the quantities in \code{independent_arg_names}. + } + + \item{quantity_weights{ + A list of named numeric values, where the name of each element is one of the + model outputs to be compared against the observed data, and the value is the + weight for that output. + } + + \item{data_definitions{ + + } + + \item{normalization_method{ + + } + + \item{dependent_arg_function{ + + } + + \item{post_process_function{ + + } + + \item{extra_penalty_function{ + + } + + \item{regularization_method{ + + } +} + +\details{ + +} + +\value{ + A function \code{obj_fun} with signature \code{obj_fun(x, lambda = 0)}. Here, + \code{x} is a numeric vector of values of the independent arguments (in the + same order as in \code{independent_arg_names}), and \code{lambda} is the value + of the regularization parameter. The \code{obj_fun} function returns values of + the error metric, as described above. +} + +\examples{ +# Example: Create an objective function that enables optimization of the +# `alphaLeaf`, `betaLeaf`, and `alphaStem` parameters of the Soybean-BioCro +# model. Additional details are provided below. + + +if (require(BioCro)) { + # We will use Soybean-BioCro as the base model definition, but we will change + # the ODE solver to use the Euler method so the model runs faster. + base_model_definition <- BioCro::soybean + base_model_definition$ode_solver <- BioCro::default_ode_solvers[['homemade_euler']] + + # We will use the `soyface_biomass` data set (included with the + # `BioCroValidation` package) for the observed values; this set includes + # observations of leaf, stem, and pod biomass from two years, which are stored + # in two data tables. However, these data tables each have a `DOY` column + # rather than a `time` column, so we need to alter them. + ambient_2002_data <- within(soyface_biomass[['ambient_2002']], { + time = (DOY - 1) * 24.0 # Define new `time` column + DOY = NULL # Remove unneeded `DOY` column + }) + + ambient_2005_data <- within(soyface_biomass[['ambient_2005']], { + time = (DOY - 1) * 24.0 # Define new `time` column + DOY = NULL # Remove unneeded `DOY` column + }) + + # The data-driver pairs can now be created by associating each data set with + # its corresponding weather data. + data_driver_pairs <- list( + ambient_2002 = list( + data = ambient_2002_data, + drivers = BioCro::soybean_weather[['2002']] + ), + ambient_2005 = list( + data = ambient_2005_data, + drivers = BioCro::soybean_weather[['2005']] + ) + ) + + # In the data, the leaf biomass is in the `Leaf_Mg_per_ha` column, but in the + # simulation output, it is in the `Leaf` column. Similar naming differences + # occur for the stem and pod mass. To address this, we can provide a data + # definition list. + data_definitions <- list( + Leaf_Mg_per_ha = 'Leaf', + Stem_Mg_per_ha = 'Stem', + Pod_Mg_per_ha = 'Pod' + ) + + # The data contains values of pod mass, but the model does not calculate pod + # mass; instead, it returns separate values of `Grain` (seed) and `Shell` + # mass, two components which form the pod together. To address this, we can + # provide a post-processing function to calculate the pod mass. + post_process_function <- function(x) { + within(x, {Pod = Grain + Shell}) + } + + # Here we wish to independently vary the `alphaLeaf` and `betaLeaf` + # parameters. We also wish to vary `alphaStem`, but require that its value is + # always equal to `alphaLeaf`. To do this, we can specify independent argument + # names, and a function for determining dependent argument values. + independent_arg_names <- c('alphaLeaf', 'betaLeaf') + + dependent_arg_function <- function(x) { + list(alphaStem = x[['alphaLeaf']]) + } + + # We will choose initial values of `alphaLeaf` and `betaLeaf` based on the + # original Soybean-BioCro model definition. + initial_ind_arg_values <- list( + alphaLeaf = BioCro::soybean$parameters$alphaLeaf, + betaLeaf = BioCro::soybean$parameters$betaLeaf + ) + + # When determining the error metric value, we wish to weight the pod twice as + # large as the leaf and stem to ensure a close fit to the pod masses. + quantity_weights <- list( + Leaf = 0.5, + Stem = 0.5, + Pod = 1 + ) + + # Now we can finally create the objective function. + obj_fun <- objective_function( + base_model_definition, + data_driver_pairs, + independent_arg_names, + initial_ind_arg_values, + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function + ) + + # This function could now be passed to an optimizer; here we will simply + # evaluate it for two sets of parameter values. + + # First try the initial values. + obj_fun(as.numeric(initial_ind_arg_values)) + + # Now try doubling each parameter value; in this case, the value of the + # objective function increases, indicating a lower degree of agreement between + # the model and the observed data. + obj_fun(2 * as.numeric(initial_ind_arg_values)) +} +} From 3ca762c8f2dfa119c6038a2e782a36224f79235d Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Mon, 5 May 2025 20:49:29 -0500 Subject: [PATCH 15/54] Consolidate two input arguments --- R/objective_function.R | 21 ++---- R/objective_function_helpers.R | 19 +++--- R/objective_function_input_checks.R | 32 ++------- man/objective_function.Rd | 59 +++++++---------- tests/testthat/test-objective_function.R | 83 ++++++------------------ 5 files changed, 62 insertions(+), 152 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index 31bbbf2..9780036 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -4,8 +4,7 @@ objective_function <- function( base_model_definition, data_driver_pairs, - independent_arg_names, - initial_ind_arg_values, + independent_args, quantity_weights, data_definitions = list(), normalization_method = 'mean_max', @@ -19,17 +18,13 @@ objective_function <- function( check_data_driver_pairs(base_model_definition, data_driver_pairs) # Check the independent arguments - check_independent_arguments( - independent_arg_names, - initial_ind_arg_values - ) + check_independent_arguments(independent_args) # Get the model runners model_runners <- lapply(data_driver_pairs, function(ddp) { get_model_runner( base_model_definition, - independent_arg_names, - initial_ind_arg_values, + independent_args, dependent_arg_function, post_process_function, ddp @@ -45,7 +40,7 @@ objective_function <- function( # Get initial model runner results initial_runner_res <- - get_initial_runner_res(model_runners, initial_ind_arg_values) + get_initial_runner_res(model_runners, independent_args) # Check the initial model runner results check_runner_results( @@ -59,11 +54,7 @@ objective_function <- function( get_long_form_data(data_driver_pairs, full_data_definitions) # Find indices corresponding to the measured time points - long_form_data <- add_time_indices( - initial_runner_res, - initial_ind_arg_values, - long_form_data - ) + long_form_data <- add_time_indices(initial_runner_res, long_form_data) # Add normalization factors long_form_data <- add_norm(long_form_data, normalization_method) @@ -83,7 +74,7 @@ objective_function <- function( ) # Check the objective function - check_obj_fun(obj_fun, initial_ind_arg_values) + check_obj_fun(obj_fun, independent_args) # Return it obj_fun diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R index 671ab2c..e6e4c91 100644 --- a/R/objective_function_helpers.R +++ b/R/objective_function_helpers.R @@ -5,19 +5,21 @@ # an error message will be returned instead get_model_runner <- function( base_model_definition, - independent_arg_names, - initial_ind_arg_values, + independent_args, dependent_arg_function, post_process_function, ddp ) { + # Get the independent argument names + independent_arg_names <- names(independent_args) + # Get the full list of arg_names arg_names <- if (is.null(dependent_arg_function)) { independent_arg_names } else { dependent_arg_values <- - dependent_arg_function(initial_ind_arg_values) + dependent_arg_function(independent_args) c(independent_arg_names, names(dependent_arg_values)) } @@ -62,10 +64,10 @@ get_model_runner <- function( } # Helping function for running each runner with the initial argument values -get_initial_runner_res <- function(model_runners, initial_ind_arg_values) { +get_initial_runner_res <- function(model_runners, independent_args) { lapply(model_runners, function(runner) { tryCatch( - runner(as.numeric(initial_ind_arg_values)), + runner(as.numeric(independent_args)), error = function(e) {as.character(e)} ) }) @@ -124,12 +126,7 @@ get_long_form_data <- function(data_driver_pairs, full_data_definitions) { } # Helping function for getting time indices -add_time_indices <- function( - initial_runner_res, - initial_ind_arg_values, - long_form_data -) -{ +add_time_indices <- function(initial_runner_res, long_form_data) { for (i in seq_along(long_form_data)) { res <- initial_runner_res[[i]] dataf <- long_form_data[[i]] diff --git a/R/objective_function_input_checks.R b/R/objective_function_input_checks.R index a4da837..77c5381 100644 --- a/R/objective_function_input_checks.R +++ b/R/objective_function_input_checks.R @@ -66,34 +66,10 @@ check_data_driver_pairs <- function(base_model_definition, data_driver_pairs) { return(invisible(NULL)) } -# Helping function for checking the independent argument names and initial -# values -check_independent_arguments <- function( - independent_arg_names, - initial_ind_arg_values -) -{ - if (length(independent_arg_names) != length(initial_ind_arg_values)) { - stop( - '`independent_arg_names` and `initial_ind_arg_values` ', - 'must have the same length' - ) - } - - if (is.null(names(initial_ind_arg_values))) { - stop('`initial_ind_arg_values` must have names') - } - - if (any(!names(initial_ind_arg_values) %in% independent_arg_names)) { - bad_arg <- !names(initial_ind_arg_values) %in% independent_arg_names - - msg <- paste( - 'The following arguments are included in `initial_ind_arg_values`', - 'but not `independent_arg_names`:', - paste(names(initial_ind_arg_values)[bad_arg], collapse = ', ') - ) - - stop(msg) +# Helping function for checking the independent arguments +check_independent_arguments <- function(independent_args) { + if (is.null(names(independent_args))) { + stop('`independent_args` must have names') } return(invisible(NULL)) diff --git a/man/objective_function.Rd b/man/objective_function.Rd index ac5f421..1c3ff06 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -27,8 +27,7 @@ objective_function( base_model_definition, data_driver_pairs, - independent_arg_names, - initial_ind_arg_values, + independent_args, quantity_weights, data_definitions = list(), normalization_method = 'mean_max', @@ -40,13 +39,12 @@ } \arguments{ - - \item{base_model_definition{ + \item{base_model_definition}{ A list meeting the requirements for BioCro \code{\link[BioCro]{crop_model_definitions}}. } - \item{data_driver_pairs{ + \item{data_driver_pairs}{ A list of named elements, where each element is a "data-driver pair." A data-driver pair is a list with two elements, \code{data} and \code{drivers}. The \code{data} element must be a data frame with one column @@ -57,43 +55,40 @@ input argument. } - \item{independent_arg_names{ - A vector or list of the names of independent model quantities to be - varied; will be passed to \code{\link[BioCro]{partial_run_biocro}}. - } - - \item{initial_ind_arg_values{ - A list of named numeric values, where the name of each element should be one - of the quantities in \code{independent_arg_names}. + \item{independent_args}{ + A list of named numeric values. The names will determine the independent + arguments to be varied during their optimization, and the values specify + "test" values of each argument that will be used internally to check that + the objective function is properly defined and can be evaluated. } - \item{quantity_weights{ + \item{quantity_weights}{ A list of named numeric values, where the name of each element is one of the model outputs to be compared against the observed data, and the value is the weight for that output. } - \item{data_definitions{ + \item{data_definitions}{ } - \item{normalization_method{ + \item{normalization_method}{ } - \item{dependent_arg_function{ + \item{dependent_arg_function}{ } - \item{post_process_function{ + \item{post_process_function}{ } - \item{extra_penalty_function{ + \item{extra_penalty_function}{ } - \item{regularization_method{ + \item{regularization_method}{ } } @@ -170,23 +165,18 @@ if (require(BioCro)) { # Here we wish to independently vary the `alphaLeaf` and `betaLeaf` # parameters. We also wish to vary `alphaStem`, but require that its value is - # always equal to `alphaLeaf`. To do this, we can specify independent argument - # names, and a function for determining dependent argument values. - independent_arg_names <- c('alphaLeaf', 'betaLeaf') + # always equal to `alphaLeaf`. To do this, we can specify independent + # arguments, and a function for determining dependent argument values. We will + # choose "test" values of the independent arguments as their values in the + # original Soybean-BioCro model. + independent_args <- BioCro::soybean[['parameters']][c('alphaLeaf', 'betaLeaf')] dependent_arg_function <- function(x) { list(alphaStem = x[['alphaLeaf']]) } - # We will choose initial values of `alphaLeaf` and `betaLeaf` based on the - # original Soybean-BioCro model definition. - initial_ind_arg_values <- list( - alphaLeaf = BioCro::soybean$parameters$alphaLeaf, - betaLeaf = BioCro::soybean$parameters$betaLeaf - ) - # When determining the error metric value, we wish to weight the pod twice as - # large as the leaf and stem to ensure a close fit to the pod masses. + # large as the leaf and stem to ensure a close fit to the observed pod masses. quantity_weights <- list( Leaf = 0.5, Stem = 0.5, @@ -197,8 +187,7 @@ if (require(BioCro)) { obj_fun <- objective_function( base_model_definition, data_driver_pairs, - independent_arg_names, - initial_ind_arg_values, + independent_args, quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function @@ -208,11 +197,11 @@ if (require(BioCro)) { # evaluate it for two sets of parameter values. # First try the initial values. - obj_fun(as.numeric(initial_ind_arg_values)) + obj_fun(as.numeric(independent_args)) # Now try doubling each parameter value; in this case, the value of the # objective function increases, indicating a lower degree of agreement between # the model and the observed data. - obj_fun(2 * as.numeric(initial_ind_arg_values)) + obj_fun(2 * as.numeric(independent_args)) } } diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index 73a480c..7b1ef24 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -13,9 +13,7 @@ ddps <- list( ) ) -independent_arg_names <- c('alphaLeaf', 'betaLeaf') - -initial_ind_arg_values <- with(BioCro::soybean[['parameters']], { +independent_args <- with(BioCro::soybean[['parameters']], { list(alphaLeaf = alphaLeaf, betaLeaf = betaLeaf) }) @@ -46,8 +44,7 @@ test_that('Objective functions can be created and behave as expected', { objective_function( model, ddps, - independent_arg_names, - initial_ind_arg_values, + independent_args, quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function @@ -55,11 +52,11 @@ test_that('Objective functions can be created and behave as expected', { ) expect_silent( - obj_fun(as.numeric(initial_ind_arg_values)) + obj_fun(as.numeric(independent_args)) ) expect_silent( - obj_fun(as.numeric(initial_ind_arg_values), 0.5) + obj_fun(as.numeric(independent_args), 0.5) ) # One data-driver pair, no dependent arguments @@ -67,8 +64,7 @@ test_that('Objective functions can be created and behave as expected', { objective_function( model, ddps[1], - independent_arg_names, - initial_ind_arg_values, + independent_args, quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function @@ -80,8 +76,7 @@ test_that('Objective functions can be created and behave as expected', { objective_function( model, ddps, - independent_arg_names, - initial_ind_arg_values, + independent_args, quantity_weights, data_definitions = data_definitions, dependent_arg_function = dependent_arg_function, @@ -95,8 +90,7 @@ test_that('Bad definitions are detected', { objective_function( model, within(ddps, {ambient_2005$drivers$temp = NULL}), - independent_arg_names, - initial_ind_arg_values, + independent_args, quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function @@ -105,44 +99,17 @@ test_that('Bad definitions are detected', { ) }) -test_that('Independent argument names must be consistent', { - expect_error( - objective_function( - model, - ddps, - independent_arg_names, - c(), - quantity_weights, - data_definitions = data_definitions, - post_process_function = post_process_function - ), - '`independent_arg_names` and `initial_ind_arg_values` must have the same length' - ) - - expect_error( - objective_function( - model, - ddps, - independent_arg_names, - list(arg1 = 1, arg2 = 2), - quantity_weights, - data_definitions = data_definitions, - post_process_function = post_process_function - ), - 'The following arguments are included in `initial_ind_arg_values` but not `independent_arg_names`: arg1, arg2' - ) - +test_that('Independent arguments must have names', { expect_error( objective_function( model, ddps, - independent_arg_names, - as.numeric(initial_ind_arg_values), + as.numeric(independent_args), quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function ), - '`initial_ind_arg_values` must have names' + '`independent_args` must have names' ) }) @@ -151,8 +118,7 @@ test_that('Bad argument names are detected', { objective_function( model, ddps, - c(independent_arg_names, 'bad_arg_name'), - c(initial_ind_arg_values, list(bad_arg_name = 1)), + c(independent_args, list(bad_arg_name = 1)), quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function @@ -169,8 +135,7 @@ test_that('Model failures are detected', { objective_function( within(model, {parameters$lnfun = 1}), ddps, - independent_arg_names, - initial_ind_arg_values, + independent_args, quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function @@ -187,8 +152,7 @@ test_that('Data-driver pairs must be complete', { objective_function( model, within(ddps, {ambient_2002$data = NULL; ambient_2005$drivers = NULL}), - independent_arg_names, - initial_ind_arg_values, + independent_args, quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function @@ -202,8 +166,7 @@ test_that('Data must have a `time` column', { objective_function( model, within(ddps, {ambient_2002$data$time = NULL}), - independent_arg_names, - initial_ind_arg_values, + independent_args, quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function @@ -217,8 +180,7 @@ test_that('Missing simulation outputs are detected', { objective_function( model, ddps, - independent_arg_names, - initial_ind_arg_values, + independent_args, quantity_weights, data_definitions = data_definitions ), @@ -234,8 +196,7 @@ test_that('Out-of-range times are detected', { objective_function( model, within(ddps, {ambient_2002$data$time <- ambient_2002$data$time + 1e5}), - independent_arg_names, - initial_ind_arg_values, + independent_args, quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function @@ -251,8 +212,7 @@ test_that('Weights must be supplied for all measured quantities', { objective_function( model, ddps, - independent_arg_names, - initial_ind_arg_values, + independent_args, list(), data_definitions = data_definitions, post_process_function = post_process_function @@ -266,8 +226,7 @@ test_that('Bad normalization methods are detected', { objective_function( model, ddps, - independent_arg_names, - initial_ind_arg_values, + independent_args, quantity_weights, normalization_method = 'bad_normalization_method', data_definitions = data_definitions, @@ -282,8 +241,7 @@ test_that('Bad return values are detected', { objective_function( model, ddps, - independent_arg_names, - initial_ind_arg_values, + independent_args, quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function, @@ -298,8 +256,7 @@ test_that('Bad regularization methods are detected', { objective_function( model, ddps, - independent_arg_names, - initial_ind_arg_values, + independent_args, quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function, From 86c766f2f6396db6cbe348634936701654ee5ddf Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Mon, 5 May 2025 22:09:46 -0500 Subject: [PATCH 16/54] More documentation --- man/objective_function.Rd | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/man/objective_function.Rd b/man/objective_function.Rd index 1c3ff06..e6d426a 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -7,8 +7,9 @@ \description{ Given a base model definition, drivers to run the model, observed values of model outputs, and the names of model arguments to vary, - \code{objective_function} creates an objective function that can be used to - parameterize the model. + \code{objective_function} creates an objective function that can be passed to + a minimization algorithm in order to find optimal parameter values that + produce the best agreement between the model and the observed data. The objective function itself is based on a weighted least-squares error metric, with optional user-defined penalty terms, and an optional @@ -69,27 +70,44 @@ } \item{data_definitions}{ - + A list of named string values, where the name of each element is one of the + data columns in the \code{data_driver_pairs} and the value is that column's + corresponding name in the model output. For all other data columns in the + \code{data_driver_pairs}, it is assumed that the data column name matches a + column in the model output. } \item{normalization_method}{ - + A string indicating the normalization method to be used when calculating the + error metric; see below for more details. } \item{dependent_arg_function}{ - + A function whose input argument is a named list of independent argument + values, and which returns a named list of dependent argument values. If the + \code{dependent_arg_function} is \code{NULL}, no dependent argument values + will be calculated. } \item{post_process_function}{ - + A function whose input argument is a data frame representing the output from + \code{\link[BioCro]{run_biocro}}, and which returns a data frame, typically + based on the input but with one or more new columns. If the + \code{post_process_function} is \code{NULL}, no post-processing will be + applied to the raw simulation output. } \item{extra_penalty_function}{ - + A function whose input argument is a data frame representing the output from + \code{\link[BioCro]{run_biocro}}, and which returns a numeric penalty to be + added to the least-squares term when calculating the error metric. If the + \code{extra_penalty_function} is \code{NULL}, no extra penalties will be + added. } \item{regularization_method}{ - + A string indicating the regularization method to be used when calculating + the regularization penalty term; see below for more details. } } From a0df150b277c2d43ff7de688adf1c9857be8933f Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Tue, 6 May 2025 11:07:16 -0500 Subject: [PATCH 17/54] More documentation and normalization methods --- R/objective_function_helpers.R | 8 +- man/objective_function.Rd | 225 +++++++++++++++++++++-- tests/testthat/test-objective_function.R | 15 +- 3 files changed, 229 insertions(+), 19 deletions(-) diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R index e6e4c91..603010b 100644 --- a/R/objective_function_helpers.R +++ b/R/objective_function_helpers.R @@ -154,7 +154,13 @@ add_norm <- function(long_form_data, normalization_method) { qname_subset <- data_table[data_table[['quantity_name']] == qname, ] - if (tolower(normalization_method) == 'mean_max') { + if (tolower(normalization_method) == 'none') { + 1.0 + } else if (tolower(normalization_method) == 'mean') { + nrow(qname_subset) + } else if (tolower(normalization_method) == 'max') { + max(qname_subset[['quantity_value']])^2 + } else if (tolower(normalization_method) == 'mean_max') { npts <- nrow(qname_subset) qmax <- max(qname_subset[['quantity_value']]) npts * qmax^2 diff --git a/man/objective_function.Rd b/man/objective_function.Rd index e6d426a..d24a616 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -66,7 +66,10 @@ \item{quantity_weights}{ A list of named numeric values, where the name of each element is one of the model outputs to be compared against the observed data, and the value is the - weight for that output. + weight for that output. Each weight can be a single number, or a pair of + numbers. When the weight is a pair, the first number is the weight that will + be used for underestimates (when the modeled value is smaller than the + observed value), and the second is the weight for overestimates. } \item{data_definitions}{ @@ -112,7 +115,191 @@ } \details{ + \strong{Overview} + + When parameterizing a BioCro model, the general idea is to vary a subset of + the model's parameters to achieve the best agreement with a set of observed + data. The degree of agreement is expressed as an "error metric," which + includes terms derived from the agreement between the modeled and observed + values, as well as (optional) penalty terms. A function that calculates an + error metric when given a set of parameter values is called an "objective + function." Defining an objective function suitable for BioCro parameterization + can be complicated, but the \code{objective_function} function helps to + simplify the process of creating such a function. It is designed to + accommodate the following scenarios, which often occur in the context of + BioCro model parameterization: + + \itemize{ + \item \strong{Multi-year or multi-location data:} Often the model needs to + be run several times with different drivers corresponding to multiple + years or locations, and then the results from each individual + simulation must be compared to associated sets of observed data. Here, + this is handled through the \code{data_driver_pairs}, which allows the + user to specify which drivers and data sets should be compared to each + other. + + \item \strong{Complicated normalization:} Care must be taken to ensure that + certain years or output variables are not over-valued in the error + metric; for example, one year may have more observations of leaf mass + than another year, or the stem mass may be much larger than the leaf + mass. Here, this is handled through pre-set normalization approaches, + which can be specified through the \code{normalization_method} input. + See below for more information. + + \item \strong{Extra penalties:} Sometimes an optimizer chooses parameters + that produce close agreement with the observed data, but are + nevertheless not biologically resonable. For example, it may produce + a sharp peak at a high leaf mass in between two measured points, when + in reality, leaf mass should vary monotonically between them. In this + case, it may be necessarily to add an extra penalty to the objective + function that prevents the optimizer from choosing such values. Here, + this is handled through the \code{extra_penalty_function} input. + + \item \strong{Flexible weights:} Typically a user would like to specify a + weight for each variable being considered in the error metric, either + to represent an uncertainty or to emphasize agreement with one output + at the expense of another. For example, the seed mass may need a high + weight to prioritize accurate yield predictions. Further, these + weights may need to differ for underestimates as compared to + overestimates; for example, measured root mass is often lower than the + true root mass, so a user may wish to penalize underestimates of root + mass more severely than overestimates. Here, this is handled through + the \code{quantity_weights} input. + + \item \strong{Dependent parameters:} Sometimes one model parameter must be + determined from one or more other parameters; for example, a user may + wish to require that the leaf and stem maintenance respiration + coefficients are identical. Here, this is handled through the + \code{dependent_arg_function}, which allows the user to specify how + the values of such "dependent" parameters should be determined from + the values of the "independent" parameters. + + \item \strong{Name mismatches:} Often a particular variable has different + names in the data set and simulation output. Here, this is handled + through the \code{data_definitions}, which allows the user to specify + which columns in the model output should be compared to particular + columns in the observed data. + + \item \strong{Incomplete outputs:} Sometimes a model may not produce outputs + that are directly comparable to the observed values; for example, a + model may calculate seed and shell mass, while a data set includes pod + mass, which is the sum of seed and shell. Here, this is handled by an + optional \code{post_process_function}, which allows users to specify + additional operations to perform on the model output; in this example, + it would be used to calculate the pod mass so it can be compared to + the observations. + } + + \strong{Error metric calculations} + + As mentioned above, the overall error metric \eqn{E} is calculated as + + \deqn{E = E_{agreement} + P_{user} + P_{regularization},} + where \eqn{E_{agreement}} is determined by the agreement between the model and + observations, \eqn{P_{user}} is an optional user-defined penalty, and + \eqn{P_{regularization}} is an optional regularization penalty. These terms + are explained in more detail below: + + \itemize{ + \item \strong{Agreement term:} The agreement term \eqn{E_{agreement}} is + calculated using a least-squares approach. In other words, + + \deqn{E_{agreement} =% + \sum_i \frac{\left(Y_{obs}^i - Y_{mod}^i \right)^2}{N_i} \cdot w_i,} + where the sum runs over all \eqn{n} observations, \eqn{Y_{obs}^i} and + \eqn{Y_{mod}^i} are observed and modeled values of variable \eqn{Y_i}, + \eqn{N_i} is a normalization factor that generally depends on the name + of \eqn{Y_i} and the set of drivers associated with the \eqn{i^{th}} + observation, and \eqn{w_i} is a weight factor that generally depends + on the name of \eqn{Y}. + + The normalization factors \eqn{N_i} are determined by the choice of + \code{normalization_method}; the available methods are discussed + below. + + The weight factors \eqn{w_i} are directly specified by the user via + the \code{quantity_weights} input. For example, if + \code{quantity_weights} has an element named \code{Leaf} that is equal + to 0.5, then \eqn{w_i} will be equal to 0.5 whenever \eqn{Y_i} + represents a leaf mass value, regardless of which set of drivers or + time point corresponds to \eqn{Y_i}. The weights can also be supplied + as \eqn{(w_{under}, w_{over})} pairs instead of single values; in this + case, the value of \eqn{w_i} depends on whether the model makes an + underprediction or an overprediction: \eqn{w_i = w_{under}} when + \eqn{Y_{mod}^i < Y_{obs}^i} and \eqn{w_i = w_{over}} otherwise. + + \item \strong{User-defined penalty term:} The user-defined penalty term + \eqn{P_{user}} is calculated by applying a function \eqn{f_{user}} to + the full simulation output from each set of drivers. In other words, + + \deqn{P_{user} = \sum_k f \left( M_k \right),} + where the sum runs over all \eqn{k} sets of drivers and \eqn{M_k} is + the model output when it is run with the \eqn{k^{th}} set of drivers. + + The function \eqn{f_{user}} must accept a single data frame as an + input and return a single numeric value as its output, but has no + other requirements. It is specified via the + \code{extra_penalty_function} argument. When + \code{extra_penalty_function} is \code{NULL}, \eqn{P_{user}} is zero. + + \item \strong{Regularization penalty term:} The regularization penalty term + \eqn{P_{regularization}} is calculated from the values of the + arguments being varied during the optimization by applying a function + \eqn{R}. In other words, + + \deqn{E_{regularization} = R \left( X \right),} + where \eqn{X} represents the model argument values. + + The function \eqn{R} is determined by the choice of + \code{regularization_method}; the available methods are discussed + below. + } + + \strong{Normalization methods} + + The following normalization methods are available: + + \itemize{ + \item \strong{none:} For this method, \eqn{N_i} is always set to 1. In other + words, no normalization is performed. + + \item \strong{mean:} For this method, when \eqn{Y_i} is named \code{vtype} + and the observation is associated with a set of drivers called + \code{vdrivers}, then + + \deqn{N_i = n_{vtype}^{vdrivers},} + where \eqn{n_{vtype}^{vdrivers}} is the number of observations of type + \code{vtype} that are associated with drivers \code{vdrivers}. In this + case, the error term \eqn{E_{agreement}} becomes a mean error for each + set of drivers, hence the name for this method. This approach avoids + over-representing drivers with larger numbers of associated + observations when determining \eqn{E_{agreement}}. + + \item \strong{max:} For this method, when \eqn{Y_i} is named \code{vtype} + and the observation is from a set called \code{vdata}, then + + \deqn{N_i = \left( max_{vtype}^{vdata} \right)^2,} + where \eqn{max_{vtype}^{vdata}} is the maximum observed value of + \code{vtype} across \code{vdata}. In this case, the observed and + modeled values that appear in the equation for \eqn{E_{agreement}} are + essentially normalized by their maximum value, hence the name for this + method. This approach avoids over-representing variable types with + larger magnitude when determining \eqn{E_{agreement}}. + + \item \strong{mean_max:} For this method, the "mean" and "max" methods are + combined so that + + \deqn{N_i =% + n_{vtype}^{vdrivers} \cdot \left( max_{vtype}^{vdata} \right)^2.} + This approach avoids over-representing drivers with larger numbers of + associated observations, and variable types with larger magnitudes. + This method is used for parameterizing Soybean-BioCro, and is + recommended for most situations. + } + When + \code{regularization_method} is \code{'none'}, \eqn{P_{regularization}} is + zero. } \value{ @@ -126,8 +313,9 @@ \examples{ # Example: Create an objective function that enables optimization of the # `alphaLeaf`, `betaLeaf`, and `alphaStem` parameters of the Soybean-BioCro -# model. Additional details are provided below. - +# model. Additional details are provided below. Important note: This example is +# designed to highlight key features of `objective_function`, and is not +# necessarily realistic. if (require(BioCro)) { # We will use Soybean-BioCro as the base model definition, but we will change @@ -177,8 +365,8 @@ if (require(BioCro)) { # mass; instead, it returns separate values of `Grain` (seed) and `Shell` # mass, two components which form the pod together. To address this, we can # provide a post-processing function to calculate the pod mass. - post_process_function <- function(x) { - within(x, {Pod = Grain + Shell}) + post_process_function <- function(sim_res) { + within(sim_res, {Pod = Grain + Shell}) } # Here we wish to independently vary the `alphaLeaf` and `betaLeaf` @@ -189,18 +377,32 @@ if (require(BioCro)) { # original Soybean-BioCro model. independent_args <- BioCro::soybean[['parameters']][c('alphaLeaf', 'betaLeaf')] - dependent_arg_function <- function(x) { - list(alphaStem = x[['alphaLeaf']]) + dependent_arg_function <- function(ind_args) { + list(alphaStem = ind_args[['alphaLeaf']]) } - # When determining the error metric value, we wish to weight the pod twice as - # large as the leaf and stem to ensure a close fit to the observed pod masses. + # When determining the error metric value, we wish to weight the pod highest + # to ensure a close fit to the observed pod masses. We also wish to decrease + # the penalty for overestimates of the stem mass, since we believe our + # observations to be underestimates. quantity_weights <- list( Leaf = 0.5, - Stem = 0.5, + Stem = c(0.5, 0.25), Pod = 1 ) + # We want to prevent the optimizer from choosing parameters that produce + # unreasonably high leaf mass + extra_penalty_function <- function(sim_res) { + max_leaf <- max(sim_res[['Leaf']], na.rm = TRUE) + + if (is.na(max_leaf) || max_leaf > 4) { + 1e5 # Add a steep penalty + } else { + 0 + } + } + # Now we can finally create the objective function. obj_fun <- objective_function( base_model_definition, @@ -208,7 +410,8 @@ if (require(BioCro)) { independent_args, quantity_weights, data_definitions = data_definitions, - post_process_function = post_process_function + post_process_function = post_process_function, + extra_penalty_function = extra_penalty_function ) # This function could now be passed to an optimizer; here we will simply diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index 7b1ef24..2b3c697 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -33,7 +33,7 @@ post_process_function <- function(x) { quantity_weights <- list( Leaf = 0.5, - Stem = 0.5, + Stem = c(0.5, 0.25), Pod = 1 ) @@ -55,10 +55,6 @@ test_that('Objective functions can be created and behave as expected', { obj_fun(as.numeric(independent_args)) ) - expect_silent( - obj_fun(as.numeric(independent_args), 0.5) - ) - # One data-driver pair, no dependent arguments obj_fun <- expect_silent( objective_function( @@ -71,7 +67,7 @@ test_that('Objective functions can be created and behave as expected', { ) ) - # Two data-driver pairs, with dependent arguments + # Two data-driver pairs, with dependent arguments and L2 regularization obj_fun <- expect_silent( objective_function( model, @@ -80,9 +76,14 @@ test_that('Objective functions can be created and behave as expected', { quantity_weights, data_definitions = data_definitions, dependent_arg_function = dependent_arg_function, - post_process_function = post_process_function + post_process_function = post_process_function, + regularization_method = 'L2' ) ) + + expect_silent( + obj_fun(as.numeric(independent_args), 0.5) + ) }) test_that('Bad definitions are detected', { From 741b115b0d99ed56a8ed40e5e3f543ec88dceb6e Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Tue, 6 May 2025 13:42:46 -0500 Subject: [PATCH 18/54] Update objective_function.Rd --- man/objective_function.Rd | 110 ++++++++++++++++++++++++++++++-------- 1 file changed, 89 insertions(+), 21 deletions(-) diff --git a/man/objective_function.Rd b/man/objective_function.Rd index d24a616..3db273f 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -208,10 +208,17 @@ \sum_i \frac{\left(Y_{obs}^i - Y_{mod}^i \right)^2}{N_i} \cdot w_i,} where the sum runs over all \eqn{n} observations, \eqn{Y_{obs}^i} and \eqn{Y_{mod}^i} are observed and modeled values of variable \eqn{Y_i}, - \eqn{N_i} is a normalization factor that generally depends on the name - of \eqn{Y_i} and the set of drivers associated with the \eqn{i^{th}} - observation, and \eqn{w_i} is a weight factor that generally depends - on the name of \eqn{Y}. + \eqn{N_i} is a normalization factor that depends on the name of + \eqn{Y_i} and the data set that includes the \eqn{i^{th}} observation, + and \eqn{w_i} is a weight factor that depends on the name of \eqn{Y}. + + Each value of \eqn{Y_{obs}^i} is specified at a particular time + \eqn{t_i}. The corresponding modeled value, \eqn{Y_{mod}^i}, is found + by retrieving the value of the \eqn{Y_i} variable at the closest time + to \eqn{t_i} that is included in the model output. It is assumed that + the model always outputs the same sequence of time values each time it + is run with a particular set of drivers, regardless of the input + argument values. The normalization factors \eqn{N_i} are determined by the choice of \code{normalization_method}; the available methods are discussed @@ -232,7 +239,7 @@ \eqn{P_{user}} is calculated by applying a function \eqn{f_{user}} to the full simulation output from each set of drivers. In other words, - \deqn{P_{user} = \sum_k f \left( M_k \right),} + \deqn{P_{user} = \sum_k f_{user} \left( M_k \right),} where the sum runs over all \eqn{k} sets of drivers and \eqn{M_k} is the model output when it is run with the \eqn{k^{th}} set of drivers. @@ -260,22 +267,21 @@ The following normalization methods are available: \itemize{ - \item \strong{none:} For this method, \eqn{N_i} is always set to 1. In other + \item \code{'none'}: For this method, \eqn{N_i} is always set to 1. In other words, no normalization is performed. - \item \strong{mean:} For this method, when \eqn{Y_i} is named \code{vtype} - and the observation is associated with a set of drivers called - \code{vdrivers}, then + \item \code{'mean'}: For this method, when \eqn{Y_i} is named \code{vtype} + and the observation is from a set called \code{vdata}, then - \deqn{N_i = n_{vtype}^{vdrivers},} - where \eqn{n_{vtype}^{vdrivers}} is the number of observations of type - \code{vtype} that are associated with drivers \code{vdrivers}. In this - case, the error term \eqn{E_{agreement}} becomes a mean error for each - set of drivers, hence the name for this method. This approach avoids + \deqn{N_i = n_{vtype}^{vdata},} + where \eqn{n_{vtype}^{vdata}} is the number of observations of type + \code{vtype} that are included in \code{vdata}. In this case, the + error term \eqn{E_{agreement}} becomes a mean error for each set of + drivers, hence the name for this method. This approach avoids over-representing drivers with larger numbers of associated observations when determining \eqn{E_{agreement}}. - \item \strong{max:} For this method, when \eqn{Y_i} is named \code{vtype} + \item \code{'max'}: For this method, when \eqn{Y_i} is named \code{vtype} and the observation is from a set called \code{vdata}, then \deqn{N_i = \left( max_{vtype}^{vdata} \right)^2,} @@ -286,7 +292,7 @@ method. This approach avoids over-representing variable types with larger magnitude when determining \eqn{E_{agreement}}. - \item \strong{mean_max:} For this method, the "mean" and "max" methods are + \item \code{'mean_max'}: For this method, the "mean" and "max" methods are combined so that \deqn{N_i =% @@ -297,9 +303,67 @@ recommended for most situations. } - When - \code{regularization_method} is \code{'none'}, \eqn{P_{regularization}} is - zero. + \strong{Regularization methods} + + The following regularization methods are available: + + \itemize{ + \item \code{'none'}: For this method, \eqn{P_{regularization}} is always set + to 0. In other words, no regularization is performed. + + \item \code{'L1'} or \code{'lasso'}: For this method, + \eqn{P_{regularization}} is given by the sum of the absolute values of + each independent argument: + + \deqn{P_{regularization} = \sum_j | X_j |,} + where the sum runs over all \eqn{j} independent arguments, and + \eqn{X_j} is the value of the \eqn{j^{th}} argument. + + \item \code{'L2'} or \code{'ridge'}: For this method, + \eqn{P_{regularization}} is given by the sum of the squared values of + each independent argument: + + \deqn{P_{regularization} = \sum_j X_j^2,} + where the sum runs over all \eqn{j} independent arguments, and + \eqn{X_j} is the value of the \eqn{j^{th}} argument. + } + + \strong{Input checks} + + Several checks are made to ensure that the objective function is properly + defined. These checks include, but are not limited to, the following: + + \itemize{ + \item Ensuring that each set of drivers in \code{data_driver_pairs} defines + a valid dynamical system along with the \code{base_model_definition}. + This is accomplished using + \code{\link[BioCro]{validate_dynamical_system_inputs}}. + + \item Ensuring that the model output corresponding to each set of drivers + spans the times at which the observations were made. + + \item Ensuring that each variable type in the data elements of + \code{data_driver_pairs} matches a corresponding column in the model + output, when accounting for the \code{data_definitions} and + \code{post_process_function}. + + \item Ensuring that each independent and dependent argument name is either + a parameter or initial value of the model. This is accomplished using + \code{\link[BioCro]{partial_run_biocro}}. Note that argument names + passed to \code{partial_run_biocro} can technically include drivers, + but it is unlikely that the value of a driver would be varied during + an optimization. + + \item Ensuring that the optional \code{dependent_arg_function}, + \code{post_process_function}, and \code{extra_penalty_function} + functions can be run without causing errors. + } + + If any issues are detected, an informative error message will be sent. Note + that several of these checks require running the model with each set of + drivers. For these checks, the argument values specified by + \code{independent_args} will be used, so they should be valid or otherwise + reasonable values. } \value{ @@ -418,11 +482,15 @@ if (require(BioCro)) { # evaluate it for two sets of parameter values. # First try the initial values. - obj_fun(as.numeric(independent_args)) + print( + obj_fun(as.numeric(independent_args)) + ) # Now try doubling each parameter value; in this case, the value of the # objective function increases, indicating a lower degree of agreement between # the model and the observed data. - obj_fun(2 * as.numeric(independent_args)) + print( + obj_fun(2 * as.numeric(independent_args)) + ) } } From a828a54ba89782e40a85aae8804bf705e1a84b4c Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Tue, 6 May 2025 14:20:03 -0500 Subject: [PATCH 19/54] Don't allow drivers to be varied --- R/objective_function.R | 8 +++-- R/objective_function_helpers.R | 25 +++++++++----- R/objective_function_input_checks.R | 42 ++++++++++++++++++++++-- man/objective_function.Rd | 3 +- tests/testthat/test-objective_function.R | 28 +++++++++++++++- 5 files changed, 92 insertions(+), 14 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index 9780036..588cea3 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -17,8 +17,12 @@ objective_function <- function( # Check the data-driver pairs check_data_driver_pairs(base_model_definition, data_driver_pairs) - # Check the independent arguments - check_independent_arguments(independent_args) + # Check the arguments to be varied + check_args_to_vary( + independent_args, + dependent_arg_function, + data_driver_pairs + ) # Get the model runners model_runners <- lapply(data_driver_pairs, function(ddp) { diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R index 603010b..5947803 100644 --- a/R/objective_function_helpers.R +++ b/R/objective_function_helpers.R @@ -1,6 +1,22 @@ ## All the functions defined in this file are intended to perform key operations ## required by `objective_function`. +# Helping function for getting a full list of argument names +get_full_arg_names <- function(independent_args, dependent_arg_function) { + # Get the independent argument names + independent_arg_names <- names(independent_args) + + # Get the full list of arg_names + if (is.null(dependent_arg_function)) { + independent_arg_names + } else { + dependent_arg_values <- + dependent_arg_function(independent_args) + + c(independent_arg_names, names(dependent_arg_values)) + } +} + # Helping function for getting a model runner; if the runner cannot be created, # an error message will be returned instead get_model_runner <- function( @@ -15,14 +31,7 @@ get_model_runner <- function( independent_arg_names <- names(independent_args) # Get the full list of arg_names - arg_names <- if (is.null(dependent_arg_function)) { - independent_arg_names - } else { - dependent_arg_values <- - dependent_arg_function(independent_args) - - c(independent_arg_names, names(dependent_arg_values)) - } + arg_names <- get_full_arg_names(independent_args, dependent_arg_function) # Build the runner tryCatch({ diff --git a/R/objective_function_input_checks.R b/R/objective_function_input_checks.R index 77c5381..43e7fb6 100644 --- a/R/objective_function_input_checks.R +++ b/R/objective_function_input_checks.R @@ -67,11 +67,49 @@ check_data_driver_pairs <- function(base_model_definition, data_driver_pairs) { } # Helping function for checking the independent arguments -check_independent_arguments <- function(independent_args) { - if (is.null(names(independent_args))) { +check_args_to_vary <- function( + independent_args, + dependent_arg_function, + data_driver_pairs +) +{ + # Make sure the independent arguments have names + ind_arg_names <- names(independent_args) + + if (is.null(ind_arg_names)) { stop('`independent_args` must have names') } + # Make sure the dependent argument function returns a named list + if (!is.null(dependent_arg_function)) { + dep_arg_names <- + names(dependent_arg_function(independent_args)) + + if (is.null(dep_arg_names)) { + stop('The return value of `dependent_arg_function` must have names') + } + } + + # Make sure no drivers were specified + arg_names <- get_full_arg_names(independent_args, dependent_arg_function) + + args_in_drivers <- lapply(data_driver_pairs, function(ddp) { + driver_names <- names(ddp[['drivers']]) + arg_names[arg_names %in% driver_names] + }) + + args_in_drivers <- unique(unlist(args_in_drivers)) + + if (length(args_in_drivers) > 0) { + msg <- paste( + 'Some independent or dependent argument names refer to columns', + 'in the drivers:', + paste(args_in_drivers, collapse = ', ') + ) + + stop(msg) + } + return(invisible(NULL)) } diff --git a/man/objective_function.Rd b/man/objective_function.Rd index 3db273f..29594da 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -352,7 +352,8 @@ \code{\link[BioCro]{partial_run_biocro}}. Note that argument names passed to \code{partial_run_biocro} can technically include drivers, but it is unlikely that the value of a driver would be varied during - an optimization. + an optimization, so the argument names are not allowed to include + columns in the drivers. \item Ensuring that the optional \code{dependent_arg_function}, \code{post_process_function}, and \code{extra_penalty_function} diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index 2b3c697..518dc1d 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -100,7 +100,7 @@ test_that('Bad definitions are detected', { ) }) -test_that('Independent arguments must have names', { +test_that('Independent and dependent arguments must have names', { expect_error( objective_function( model, @@ -112,6 +112,32 @@ test_that('Independent arguments must have names', { ), '`independent_args` must have names' ) + + expect_error( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + dependent_arg_function = function(x) {1.0} + ), + 'The return value of `dependent_arg_function` must have names' + ) + + expect_error( + objective_function( + model, + ddps, + c(independent_args, list(solar = 1000)), + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + dependent_arg_function = function(x) {list(precip = 0.1)} + ), + 'Some independent or dependent argument names refer to columns in the drivers: solar, precip' + ) }) test_that('Bad argument names are detected', { From 1d010a46215d24875db2fd0670ba9cc4ca842717 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Tue, 6 May 2025 14:46:27 -0500 Subject: [PATCH 20/54] Add ability to return error metric terms --- R/objective_function_helpers.R | 33 +++++++++++++++++++----- man/objective_function.Rd | 27 +++++++++++++++---- tests/testthat/test-objective_function.R | 8 +++++- 3 files changed, 55 insertions(+), 13 deletions(-) diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R index 5947803..6c4d24d 100644 --- a/R/objective_function_helpers.R +++ b/R/objective_function_helpers.R @@ -234,7 +234,8 @@ error_from_res <- function( long_form_data_table, quantity_weights, normalization_method, - extra_penalty_function + extra_penalty_function, + return_terms ) { # If the simulation did not finish, return a very high value @@ -265,8 +266,15 @@ error_from_res <- function( one_error(obs, pred, wt, norm) }) - # Return the sum of the penalty and error terms - penalty + sum(errors) + # Return the sum of the penalty and error terms, or the individual errors + if (return_terms) { + list( + least_squares_term = sum(errors), + extra_penalty = penalty + ) + } else { + penalty + sum(errors) + } } # Helping function for calculating a regularization penalty term @@ -297,8 +305,8 @@ get_obj_fun <- function( regularization_method ) { - function(x, lambda = 0) { - errors <- sapply(seq_along(model_runners), function(i) { + function(x, lambda = 0, return_terms = FALSE) { + errors <- lapply(seq_along(model_runners), function(i) { runner <- model_runners[[i]] res <- runner(x) @@ -307,12 +315,23 @@ get_obj_fun <- function( long_form_data[[i]], processed_weights, normalization_method, - extra_penalty_function + extra_penalty_function, + return_terms ) }) reg_penalty <- regularization_penalty(x, regularization_method, lambda) - sum(errors) + if (return_terms) { + list( + terms_from_data_driver_pairs = stats::setNames( + errors, + names(model_runners) + ), + regularization_penalty = reg_penalty + ) + } else { + sum(as.numeric(errors)) + reg_penalty + } } } diff --git a/man/objective_function.Rd b/man/objective_function.Rd index 29594da..df25d92 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -368,11 +368,23 @@ } \value{ - A function \code{obj_fun} with signature \code{obj_fun(x, lambda = 0)}. Here, - \code{x} is a numeric vector of values of the independent arguments (in the - same order as in \code{independent_arg_names}), and \code{lambda} is the value - of the regularization parameter. The \code{obj_fun} function returns values of - the error metric, as described above. + A function \code{obj_fun} with signature + \code{obj_fun(x, lambda = 0, return_terms = FALSE)}. + + Here, \code{x} is a numeric vector of values of the independent arguments (in + the same order as in \code{independent_arg_names}), and \code{lambda} is the + value of the regularization parameter. + + The \code{return_terms} argument determines the return value of + \code{obj_fun}. When \code{return_terms} is \code{FALSE}, \code{obj_fun} + returns values of the error metric \eqn{E}. When \code{return_terms} is + \code{TRUE}, \code{obj_fun} returns a list including each individual term of + the total error metric. + + During optimization, \code{return_terms} should always be \code{FALSE}. + Setting it to \code{TRUE} can be useful for troubleshooting, or for + diagnostics such as checking the quality of fit for each of the data-driver + pairs. } \examples{ @@ -493,5 +505,10 @@ if (require(BioCro)) { print( obj_fun(2 * as.numeric(independent_args)) ) + + # We can also see the values of each term that makes up the error metric + str( + obj_fun(2 * as.numeric(independent_args), return_terms = TRUE) + ) } } diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index 518dc1d..ea2dc8a 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -82,7 +82,13 @@ test_that('Objective functions can be created and behave as expected', { ) expect_silent( - obj_fun(as.numeric(independent_args), 0.5) + obj_fun(as.numeric(independent_args), lambda = 0.5) + ) + + expect_true( + is.list( + obj_fun(as.numeric(independent_args), lambda = 0.5, return_terms = TRUE) + ) ) }) From a5d4ac9d8eb1449b22955a1646abff3375364e71 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Tue, 6 May 2025 14:54:58 -0500 Subject: [PATCH 21/54] Add verbose option --- R/objective_function.R | 5 +++-- R/objective_function_input_checks.R | 12 ++++++++++-- man/objective_function.Rd | 18 +++++++++++++++--- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index 588cea3..1c4966a 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -11,7 +11,8 @@ objective_function <- function( dependent_arg_function = NULL, post_process_function = NULL, extra_penalty_function = NULL, - regularization_method = 'none' + regularization_method = 'none', + verbose_startup = FALSE ) { # Check the data-driver pairs @@ -78,7 +79,7 @@ objective_function <- function( ) # Check the objective function - check_obj_fun(obj_fun, independent_args) + check_obj_fun(obj_fun, independent_args, verbose_startup) # Return it obj_fun diff --git a/R/objective_function_input_checks.R b/R/objective_function_input_checks.R index 43e7fb6..31620f2 100644 --- a/R/objective_function_input_checks.R +++ b/R/objective_function_input_checks.R @@ -229,8 +229,16 @@ check_runner_results <- function( # Helping function for checking the objective function; will throw an error if a # problem is detected, and will otherwise be silent with no return value. -check_obj_fun <- function(obj_fun, initial_ind_arg_values) { - initial_error <- obj_fun(as.numeric(initial_ind_arg_values)) +check_obj_fun <- function(obj_fun, initial_ind_arg_values, verbose) { + initial_error_terms <- + obj_fun(as.numeric(initial_ind_arg_values), return_terms = TRUE) + + if (verbose) { + cat('\nThe initial error metric terms are:\n') + utils::str(initial_error_terms) + } + + initial_error <- sum(unlist(initial_error_terms)) if (!is.finite(initial_error)) { stop( diff --git a/man/objective_function.Rd b/man/objective_function.Rd index df25d92..57de202 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -35,7 +35,8 @@ dependent_arg_function = NULL, post_process_function = NULL, extra_penalty_function = NULL, - regularization_method = 'none' + regularization_method = 'none', + verbose_startup = FALSE ) } @@ -112,6 +113,12 @@ A string indicating the regularization method to be used when calculating the regularization penalty term; see below for more details. } + + \item{verbose_startup}{ + A logical (\code{TRUE} or \code{FALSE}) value indicating whether to print + additional information to the R terminal when creating the objective + function. + } } \details{ @@ -480,7 +487,8 @@ if (require(BioCro)) { } } - # Now we can finally create the objective function. + # Now we can finally create the objective function; here we choose to print + # extra startup information, but this is not strictly required. obj_fun <- objective_function( base_model_definition, data_driver_pairs, @@ -488,13 +496,15 @@ if (require(BioCro)) { quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function, - extra_penalty_function = extra_penalty_function + extra_penalty_function = extra_penalty_function, + verbose_startup = TRUE ) # This function could now be passed to an optimizer; here we will simply # evaluate it for two sets of parameter values. # First try the initial values. + cat('\nError metric calculated from initial argument values:\n') print( obj_fun(as.numeric(independent_args)) ) @@ -502,11 +512,13 @@ if (require(BioCro)) { # Now try doubling each parameter value; in this case, the value of the # objective function increases, indicating a lower degree of agreement between # the model and the observed data. + cat('\nError metric calculated by doubling the initial argument values:\n') print( obj_fun(2 * as.numeric(independent_args)) ) # We can also see the values of each term that makes up the error metric + cat('\nError metric terms calculated by doubling the initial argument values:\n') str( obj_fun(2 * as.numeric(independent_args), return_terms = TRUE) ) From 2bd73884cfe9b6280ceb4cee1695f1b0715b3016 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Tue, 6 May 2025 15:03:54 -0500 Subject: [PATCH 22/54] Mention lambda in documentation --- man/objective_function.Rd | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/man/objective_function.Rd b/man/objective_function.Rd index 57de202..7a659f7 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -261,7 +261,7 @@ arguments being varied during the optimization by applying a function \eqn{R}. In other words, - \deqn{E_{regularization} = R \left( X \right),} + \deqn{P_{regularization} = R \left( X \right),} where \eqn{X} represents the model argument values. The function \eqn{R} is determined by the choice of @@ -320,19 +320,23 @@ \item \code{'L1'} or \code{'lasso'}: For this method, \eqn{P_{regularization}} is given by the sum of the absolute values of - each independent argument: + each independent argument, multiplied by a "regularization parameter" + \eqn{\lambda} that sets the overall weight of the penalty: \deqn{P_{regularization} = \sum_j | X_j |,} where the sum runs over all \eqn{j} independent arguments, and - \eqn{X_j} is the value of the \eqn{j^{th}} argument. + \eqn{X_j} is the value of the \eqn{j^{th}} argument. See the "Value" + section below for details of how to specify \eqn{\lambda}. \item \code{'L2'} or \code{'ridge'}: For this method, \eqn{P_{regularization}} is given by the sum of the squared values of - each independent argument: + each independent argument, multiplied by a "regularization parameter" + \eqn{\lambda} that sets the overall weight of the penalty: \deqn{P_{regularization} = \sum_j X_j^2,} where the sum runs over all \eqn{j} independent arguments, and - \eqn{X_j} is the value of the \eqn{j^{th}} argument. + \eqn{X_j} is the value of the \eqn{j^{th}} argument. See the "Value" + section below for details of how to specify \eqn{\lambda}. } \strong{Input checks} From c44ac52fc998a0bd909bfe5eceb7cb3d152e4212 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Tue, 6 May 2025 15:04:53 -0500 Subject: [PATCH 23/54] Update NEWS.md --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 1aaf1d6..8141b4c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -37,6 +37,8 @@ be directly added to this file to describe the related changes. - Added 2002 and 2005 SoyFACE biomass data. +- Added the `objective_function` function. + # Changes in BioCroValidation Version 0.1.0 - This is the first version of BioCroValidation. At this point, the package is From cdfa72448825511912b8ecc88b6b7956ee4d1f20 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Tue, 6 May 2025 18:19:28 -0500 Subject: [PATCH 24/54] Fix pkgdown math formatting And add some forgotten lambdas --- _pkgdown.yml | 1 + man/objective_function.Rd | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index 76ebccb..27d10ba 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,5 +1,6 @@ url: https://biocro.github.io/BioCroValidation/ template: + math-rendering: mathjax bootstrap: 5 bootswatch: flatly theme: arrow-light diff --git a/man/objective_function.Rd b/man/objective_function.Rd index 7a659f7..d3a4c0b 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -323,7 +323,7 @@ each independent argument, multiplied by a "regularization parameter" \eqn{\lambda} that sets the overall weight of the penalty: - \deqn{P_{regularization} = \sum_j | X_j |,} + \deqn{P_{regularization} = \lambda \sum_j | X_j |,} where the sum runs over all \eqn{j} independent arguments, and \eqn{X_j} is the value of the \eqn{j^{th}} argument. See the "Value" section below for details of how to specify \eqn{\lambda}. @@ -333,7 +333,7 @@ each independent argument, multiplied by a "regularization parameter" \eqn{\lambda} that sets the overall weight of the penalty: - \deqn{P_{regularization} = \sum_j X_j^2,} + \deqn{P_{regularization} = \lambda \sum_j X_j^2,} where the sum runs over all \eqn{j} independent arguments, and \eqn{X_j} is the value of the \eqn{j^{th}} argument. See the "Value" section below for details of how to specify \eqn{\lambda}. From 95b340d12dd92e11c78d76ad090ada88405c8b27 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Thu, 8 May 2025 16:27:41 -0500 Subject: [PATCH 25/54] Document the failure return value --- R/objective_function_helpers.R | 32 +++++++++++++++++++++++++++++--- man/objective_function.Rd | 6 ++++++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R index 6c4d24d..30f1878 100644 --- a/R/objective_function_helpers.R +++ b/R/objective_function_helpers.R @@ -1,6 +1,9 @@ ## All the functions defined in this file are intended to perform key operations ## required by `objective_function`. +# Value to return when a simulation fails to run +FAILURE_VALUE <- 1e10 + # Helping function for getting a full list of argument names get_full_arg_names <- function(independent_args, dependent_arg_function) { # Get the independent argument names @@ -228,6 +231,18 @@ one_error <- function(observed, predicted, weight, normalization) { (observed - predicted)^2 * weight_multiplier / normalization } +# Helping function for returning a failure value +failure_value <- function(error_sum, return_terms) { + if (return_terms) { + list( + least_squares_term = error_sum, + extra_penalty = FAILURE_VALUE + ) + } else { + FAILURE_VALUE + } +} + # Helping function that calculates an error value from a simulation result error_from_res <- function( simulation_result, @@ -242,7 +257,9 @@ error_from_res <- function( expected_npts <- long_form_data_table[1, 'expected_npts'] if (nrow(simulation_result) < expected_npts) { - return(1e6) + return( + failure_value(NA, return_terms) + ) } # Calculate any user-specified penalties @@ -266,14 +283,23 @@ error_from_res <- function( one_error(obs, pred, wt, norm) }) + error_sum <- sum(errors) + + # If the error sum is not finite, return a very high value + if (!is.finite(error_sum)) { + return( + failure_value(error_sum, return_terms) + ) + } + # Return the sum of the penalty and error terms, or the individual errors if (return_terms) { list( - least_squares_term = sum(errors), + least_squares_term = error_sum, extra_penalty = penalty ) } else { - penalty + sum(errors) + penalty + error_sum } } diff --git a/man/objective_function.Rd b/man/objective_function.Rd index d3a4c0b..eb8ece3 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -242,6 +242,12 @@ underprediction or an overprediction: \eqn{w_i = w_{under}} when \eqn{Y_{mod}^i < Y_{obs}^i} and \eqn{w_i = w_{over}} otherwise. + There are a few special cases where \eqn{E_{agreement}} is set to a + very high value (\code{BioCroValidation:::FAILURE_VALUE}). This is + done when a simulation fails to run, when the \eqn{E_{agreement}} term + would otherwise evaluate to \code{NA}, or when the \eqn{E_{agreement}} + term would otherwise evaluate to an infinite value. + \item \strong{User-defined penalty term:} The user-defined penalty term \eqn{P_{user}} is calculated by applying a function \eqn{f_{user}} to the full simulation output from each set of drivers. In other words, From d6902a487547e1fd5b5c9927e1be5e544e118bae Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Thu, 8 May 2025 16:45:51 -0500 Subject: [PATCH 26/54] Add more verbose output --- R/objective_function.R | 15 +++++++++++++++ R/objective_function_input_checks.R | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/R/objective_function.R b/R/objective_function.R index 1c4966a..25b7f53 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -40,6 +40,11 @@ objective_function <- function( full_data_definitions <- get_data_definition_list(data_driver_pairs, data_definitions) + if (verbose_startup) { + cat('\nThe full data definitions:\n\n') + str(full_data_definitions) + } + # Check the model runners check_runners(model_runners) @@ -64,10 +69,20 @@ objective_function <- function( # Add normalization factors long_form_data <- add_norm(long_form_data, normalization_method) + if (verbose_startup) { + cat('\nThe user-supplied data in long form:\n\n') + print(long_form_data) + } + # Process the quantity weights processed_weights <- process_quantity_weights(quantity_weights, long_form_data) + if (verbose_startup) { + cat('\nThe user-supplied weights:\n\n') + str(processed_weights) + } + # Create the objective function obj_fun <- get_obj_fun( model_runners, diff --git a/R/objective_function_input_checks.R b/R/objective_function_input_checks.R index 31620f2..75838e4 100644 --- a/R/objective_function_input_checks.R +++ b/R/objective_function_input_checks.R @@ -234,7 +234,7 @@ check_obj_fun <- function(obj_fun, initial_ind_arg_values, verbose) { obj_fun(as.numeric(initial_ind_arg_values), return_terms = TRUE) if (verbose) { - cat('\nThe initial error metric terms are:\n') + cat('\nThe initial error metric terms:\n') utils::str(initial_error_terms) } From 578c1d4e35328d2dd272c8ecdb346b58cd4156ba Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Thu, 8 May 2025 17:33:37 -0500 Subject: [PATCH 27/54] Enable year-specific weights --- R/objective_function.R | 19 +++++++++---- R/objective_function_helpers.R | 35 ++++++++++++++++++------ R/objective_function_input_checks.R | 14 ++++++---- man/objective_function.Rd | 9 ++++-- tests/testthat/test-objective_function.R | 9 ++++-- 5 files changed, 61 insertions(+), 25 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index 25b7f53..fba2a3d 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -42,7 +42,7 @@ objective_function <- function( if (verbose_startup) { cat('\nThe full data definitions:\n\n') - str(full_data_definitions) + utils::str(full_data_definitions) } # Check the model runners @@ -75,19 +75,28 @@ objective_function <- function( } # Process the quantity weights - processed_weights <- + full_quantity_weights <- process_quantity_weights(quantity_weights, long_form_data) if (verbose_startup) { - cat('\nThe user-supplied weights:\n\n') - str(processed_weights) + cat('The user-supplied quantity weights:\n\n') + utils::str(full_quantity_weights) + } + + # Get the data-driver pair weights + ddp_weights <- get_ddp_weights(data_driver_pairs) + + if (verbose_startup) { + cat('\nThe user-supplied data-driver pair weights:\n\n') + utils::str(ddp_weights) } # Create the objective function obj_fun <- get_obj_fun( model_runners, long_form_data, - processed_weights, + full_quantity_weights, + ddp_weights, normalization_method, extra_penalty_function, regularization_method diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R index 30f1878..23a63bd 100644 --- a/R/objective_function_helpers.R +++ b/R/objective_function_helpers.R @@ -220,15 +220,29 @@ process_quantity_weights <- function(quantity_weights, long_form_data) { }) } +# Helping function for getting the data-driver pair weights +get_ddp_weights <- function(data_driver_pairs) { + lapply(data_driver_pairs, function(ddp) { + ddp[['weight']] + }) +} + # Helping function that calculates one error -one_error <- function(observed, predicted, weight, normalization) { - weight_multiplier <- if (predicted < observed) { - weight[1] # Underprediction +one_error <- function( + observed, + predicted, + quantity_weight, + ddp_weight, + normalization +) +{ + qw <- if (predicted < observed) { + quantity_weight[1] # Underprediction } else { - weight[2] # Overprediction + quantity_weight[2] # Overprediction } - (observed - predicted)^2 * weight_multiplier / normalization + (observed - predicted)^2 * qw * ddp_weight / normalization } # Helping function for returning a failure value @@ -248,6 +262,7 @@ error_from_res <- function( simulation_result, long_form_data_table, quantity_weights, + ddp_weight, normalization_method, extra_penalty_function, return_terms @@ -277,10 +292,10 @@ error_from_res <- function( obs <- long_form_data_table[i, 'quantity_value'] indx <- long_form_data_table[i, 'time_index'] pred <- simulation_result[indx, qname] - wt <- quantity_weights[[qname]] + qt_wt <- quantity_weights[[qname]] norm <- long_form_data_table[i, 'norm'] - one_error(obs, pred, wt, norm) + one_error(obs, pred, qt_wt, ddp_weight, norm) }) error_sum <- sum(errors) @@ -325,7 +340,8 @@ regularization_penalty <- function( get_obj_fun <- function( model_runners, long_form_data, - processed_weights, + full_quantity_weights, + ddp_weights, normalization_method, extra_penalty_function, regularization_method @@ -339,7 +355,8 @@ get_obj_fun <- function( error_from_res( res, long_form_data[[i]], - processed_weights, + full_quantity_weights, + ddp_weights[[i]], normalization_method, extra_penalty_function, return_terms diff --git a/R/objective_function_input_checks.R b/R/objective_function_input_checks.R index 75838e4..8ec552b 100644 --- a/R/objective_function_input_checks.R +++ b/R/objective_function_input_checks.R @@ -9,16 +9,20 @@ check_data_driver_pairs <- function(base_model_definition, data_driver_pairs) { stop('`data_driver_pairs` must have names') } + required_elements <- c('drivers', 'data', 'weight') + has_elements <- sapply(data_driver_pairs, function(x) { - 'drivers' %in% names(x) && 'data' %in% names(x) + all(required_elements %in% names(x)) }) if (any(!has_elements)) { missing_elements <- names(data_driver_pairs)[!has_elements] - msg <- paste( - 'The following data-driver pairs are missing a `drivers` element,', - 'a `data` element, or both:', + msg <- paste0( + 'The following data-driver pairs are missing at least one ', + 'required element (', + paste(required_elements, collapse = ', '), + '): ', paste(missing_elements, collapse = ', ') ) @@ -234,7 +238,7 @@ check_obj_fun <- function(obj_fun, initial_ind_arg_values, verbose) { obj_fun(as.numeric(initial_ind_arg_values), return_terms = TRUE) if (verbose) { - cat('\nThe initial error metric terms:\n') + cat('\nThe initial error metric terms:\n\n') utils::str(initial_error_terms) } diff --git a/man/objective_function.Rd b/man/objective_function.Rd index eb8ece3..71e08b4 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -433,15 +433,18 @@ if (require(BioCro)) { }) # The data-driver pairs can now be created by associating each data set with - # its corresponding weather data. + # its corresponding weather data. Here we will weight the 2005 data twice as + # heavily as the 2002 data. data_driver_pairs <- list( ambient_2002 = list( data = ambient_2002_data, - drivers = BioCro::soybean_weather[['2002']] + drivers = BioCro::soybean_weather[['2002']], + weight = 1 ), ambient_2005 = list( data = ambient_2005_data, - drivers = BioCro::soybean_weather[['2005']] + drivers = BioCro::soybean_weather[['2005']], + weight = 2 ) ) diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index ea2dc8a..259676c 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -5,11 +5,13 @@ model$ode_solver <- BioCro::default_ode_solvers[['homemade_euler']] ddps <- list( ambient_2002 = list( data = within(soyface_biomass[['ambient_2002']], {time = (DOY - 1) * 24.0; DOY = NULL}), - drivers = BioCro::soybean_weather[['2002']] + drivers = BioCro::soybean_weather[['2002']], + weight = 1 ), ambient_2005 = list( data = within(soyface_biomass[['ambient_2005']], {time = (DOY - 1) * 24.0; DOY = NULL}), - drivers = BioCro::soybean_weather[['2005']] + drivers = BioCro::soybean_weather[['2005']], + weight = 2 ) ) @@ -190,7 +192,8 @@ test_that('Data-driver pairs must be complete', { data_definitions = data_definitions, post_process_function = post_process_function ), - 'The following data-driver pairs are missing a `drivers` element, a `data` element, or both: ambient_2002, ambient_2005' + 'The following data-driver pairs are missing at least one required element (drivers, data, weight): ambient_2002, ambient_2005', + fixed = TRUE ) }) From 63a8277494c4c193e6dbb679a5c83ae70b734562 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 9 May 2025 09:40:01 -0500 Subject: [PATCH 28/54] Normalize by number of data-driver pairs --- R/objective_function.R | 6 +++++- R/objective_function_helpers.R | 6 +++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index fba2a3d..b618710 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -67,7 +67,11 @@ objective_function <- function( long_form_data <- add_time_indices(initial_runner_res, long_form_data) # Add normalization factors - long_form_data <- add_norm(long_form_data, normalization_method) + long_form_data <- add_norm( + long_form_data, + normalization_method, + length(data_driver_pairs) + ) if (verbose_startup) { cat('\nThe user-supplied data in long form:\n\n') diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R index 23a63bd..659df31 100644 --- a/R/objective_function_helpers.R +++ b/R/objective_function_helpers.R @@ -156,7 +156,7 @@ add_time_indices <- function(initial_runner_res, long_form_data) { } # Helping function for getting normalization factors -add_norm <- function(long_form_data, normalization_method) { +add_norm <- function(long_form_data, normalization_method, n_ddp) { for (i in seq_along(long_form_data)) { data_table <- long_form_data[[i]] @@ -169,13 +169,13 @@ add_norm <- function(long_form_data, normalization_method) { if (tolower(normalization_method) == 'none') { 1.0 } else if (tolower(normalization_method) == 'mean') { - nrow(qname_subset) + nrow(qname_subset) * n_ddp } else if (tolower(normalization_method) == 'max') { max(qname_subset[['quantity_value']])^2 } else if (tolower(normalization_method) == 'mean_max') { npts <- nrow(qname_subset) qmax <- max(qname_subset[['quantity_value']]) - npts * qmax^2 + npts * n_ddp * qmax^2 } else { stop('Unsupported normalization_method: ', normalization_method) } From af4e40c72c15b9aab8bee7213301f1b4bb3860c0 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 9 May 2025 10:33:08 -0500 Subject: [PATCH 29/54] Enable stdev as an optional part of the data-driver pairs --- R/objective_function_helpers.R | 16 ++++++-- R/objective_function_input_checks.R | 64 +++++++++++++++++++++++++++++ man/objective_function.Rd | 35 ++++++++-------- 3 files changed, 95 insertions(+), 20 deletions(-) diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R index 659df31..c647b30 100644 --- a/R/objective_function_helpers.R +++ b/R/objective_function_helpers.R @@ -114,19 +114,29 @@ get_data_definition_list <- function(data_driver_pairs, user_data_definitions) data_definitions } -# Helping function for converting each data table to a "long form." +# Helping function for converting each data table to a "long form," including +# stdev values when available get_long_form_data <- function(data_driver_pairs, full_data_definitions) { lapply(data_driver_pairs, function(ddp) { short_form_data <- ddp[['data']] + has_std <- 'data_stdev' %in% names(ddp) + + short_form_stdev <- if (has_std) { + ddp[['data_stdev']] + } else { + NA + } + data_column_names <- colnames(short_form_data) data_column_names <- data_column_names[data_column_names != 'time'] long_form_data_list <- lapply(data_column_names, function(cn) { data.frame( - time = short_form_data[, 'time'], - quantity_name = full_data_definitions[[cn]], + time = short_form_data[, 'time'], + quantity_name = full_data_definitions[[cn]], quantity_value = short_form_data[, cn], + quantity_stdev = if (has_std) {short_form_stdev[, cn]} else {1}, stringsAsFactors = FALSE ) }) diff --git a/R/objective_function_input_checks.R b/R/objective_function_input_checks.R index 8ec552b..e346995 100644 --- a/R/objective_function_input_checks.R +++ b/R/objective_function_input_checks.R @@ -5,10 +5,12 @@ # Helping function for checking the data-driver pairs; will throw an error if # a problem is detected, and will otherwise be silent with no return value. check_data_driver_pairs <- function(base_model_definition, data_driver_pairs) { + # Data-driver pairs must have names if (is.null(names(data_driver_pairs))) { stop('`data_driver_pairs` must have names') } + # Each data-driver pair must have the required elements required_elements <- c('drivers', 'data', 'weight') has_elements <- sapply(data_driver_pairs, function(x) { @@ -29,6 +31,29 @@ check_data_driver_pairs <- function(base_model_definition, data_driver_pairs) { stop(msg) } + # Only required or optional elements should be provided + optional_elements <- 'data_stdev' + + acceptable_elements <- c(required_elements, optional_elements) + + has_extra_elements <- sapply(data_driver_pairs, function(x) { + any(!names(x) %in% acceptable_elements) + }) + + if (any(has_extra_elements)) { + bad_elements <- names(data_driver_pairs)[has_extra_elements] + + msg <- paste0( + 'The following data-driver pairs have unexpected elements: ', + paste(bad_elements, collapse = ', '), + '. The allowed elements are: ', + paste(acceptable_elements, collapse = ', ') + ) + + stop(msg) + } + + # Each data table must have a time column has_time <- sapply(data_driver_pairs, function(x) { 'time' %in% colnames(x[['data']]) }) @@ -45,6 +70,45 @@ check_data_driver_pairs <- function(base_model_definition, data_driver_pairs) { stop(msg) } + # If provided, stdev tables must have the same columns and time values as + # their corresponding data tables. Time values must also be in the same + # order. + stdev_okay <- sapply(data_driver_pairs, function(x) { + if ('data_stdev' %in% colnames(x)) { + data_table <- x[['data']] + stdev_table <- x[['data_stdev']] + + colnames_match <- identical( + sort(colnames(data_table)), + sort(colnames(stdev_table)) + ) + + times_match <- isTRUE(all.equal( + data_table[['time']], + stdev_table[['time']] + )) + + colnames_match && times_match + } else { + TRUE + } + }) + + if (any(!stdev_okay)) { + bad_stdev <- names(data_driver_pairs)[!stdev_okay] + + msg <- paste( + 'The following data-driver pairs have a `data_stdev` element', + 'that does not match the columns and/or times of their', + '`data` element:', + paste(bad_stdev, collapse = ', ') + ) + + stop(msg) + } + + # Each set of drivers must form a valid dynamical system along with the + # base model definition valid_definitions <- sapply(data_driver_pairs, function(ddp) { BioCro::validate_dynamical_system_inputs( base_model_definition[['initial_values']], diff --git a/man/objective_function.Rd b/man/objective_function.Rd index 71e08b4..d7c069e 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -420,31 +420,32 @@ if (require(BioCro)) { # We will use the `soyface_biomass` data set (included with the # `BioCroValidation` package) for the observed values; this set includes # observations of leaf, stem, and pod biomass from two years, which are stored - # in two data tables. However, these data tables each have a `DOY` column - # rather than a `time` column, so we need to alter them. - ambient_2002_data <- within(soyface_biomass[['ambient_2002']], { - time = (DOY - 1) * 24.0 # Define new `time` column - DOY = NULL # Remove unneeded `DOY` column - }) - - ambient_2005_data <- within(soyface_biomass[['ambient_2005']], { - time = (DOY - 1) * 24.0 # Define new `time` column - DOY = NULL # Remove unneeded `DOY` column - }) + # in two data tables; it also includes the standard deviations of the measured + # biomasses, which are included in two separate tables. However, these data + # tables each have a `DOY` column rather than a `time` column, so we need to + # alter them. Here we will do this by defining a short helper function. + convert_time <- function(x) { + within(x, { + time = (DOY - 1) * 24.0 # Define new `time` column + DOY = NULL # Remove unneeded `DOY` column + }) + } # The data-driver pairs can now be created by associating each data set with # its corresponding weather data. Here we will weight the 2005 data twice as # heavily as the 2002 data. data_driver_pairs <- list( ambient_2002 = list( - data = ambient_2002_data, - drivers = BioCro::soybean_weather[['2002']], - weight = 1 + data = convert_time(soyface_biomass[['ambient_2002']]), + data_stdev = convert_time(soyface_biomass[['ambient_2002_std']]), + drivers = BioCro::soybean_weather[['2002']], + weight = 1 ), ambient_2005 = list( - data = ambient_2005_data, - drivers = BioCro::soybean_weather[['2005']], - weight = 2 + data = convert_time(soyface_biomass[['ambient_2005']]), + data_stdev = convert_time(soyface_biomass[['ambient_2005_std']]), + drivers = BioCro::soybean_weather[['2005']], + weight = 2 ) ) From de53a8326632e7fe6c5a21d161ac6bd2b041bdaf Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 9 May 2025 11:16:47 -0500 Subject: [PATCH 30/54] Add variance-based weights --- R/objective_function.R | 7 ++++ R/objective_function_helpers.R | 41 ++++++++++++++---- R/objective_function_input_checks.R | 32 +++++++++----- man/objective_function.Rd | 12 +++++- tests/testthat/test-objective_function.R | 53 +++++++++++++++++++++++- 5 files changed, 124 insertions(+), 21 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index b618710..e4ca11c 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -8,6 +8,7 @@ objective_function <- function( quantity_weights, data_definitions = list(), normalization_method = 'mean_max', + variance_weight_method = 'equal', dependent_arg_function = NULL, post_process_function = NULL, extra_penalty_function = NULL, @@ -73,6 +74,12 @@ objective_function <- function( length(data_driver_pairs) ) + # Add variance-based weights + long_form_data <- add_w_var( + long_form_data, + variance_weight_method + ) + if (verbose_startup) { cat('\nThe user-supplied data in long form:\n\n') print(long_form_data) diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R index c647b30..a9da812 100644 --- a/R/objective_function_helpers.R +++ b/R/objective_function_helpers.R @@ -176,7 +176,7 @@ add_norm <- function(long_form_data, normalization_method, n_ddp) { qname_subset <- data_table[data_table[['quantity_name']] == qname, ] - if (tolower(normalization_method) == 'none') { + if (tolower(normalization_method) == 'equal') { 1.0 } else if (tolower(normalization_method) == 'mean') { nrow(qname_subset) * n_ddp @@ -197,6 +197,29 @@ add_norm <- function(long_form_data, normalization_method, n_ddp) { long_form_data } +# Helping function for getting variance-based weights +add_w_var <- function(long_form_data, variance_weight_method) { + for (i in seq_along(long_form_data)) { + data_table <- long_form_data[[i]] + data_stdev <- data_table[['quantity_stdev']] + + data_table[['w_var']] <- + if (tolower(variance_weight_method) == 'equal') { + 1.0 + } else if (tolower(variance_weight_method) == 'logarithm') { + log(1 / (data_stdev + 1e-5)) + } else if (tolower(variance_weight_method) == 'inverse') { + 1 / data_stdev^2 + } else { + stop('Unsupported variance_weight_method: ', variance_weight_method) + } + + long_form_data[[i]] <- data_table + } + + long_form_data +} + # Helping function that processes and checks the quantity weights process_quantity_weights <- function(quantity_weights, long_form_data) { # First make sure that weights have been provided for all measured @@ -243,6 +266,7 @@ one_error <- function( predicted, quantity_weight, ddp_weight, + var_weight, normalization ) { @@ -252,7 +276,7 @@ one_error <- function( quantity_weight[2] # Overprediction } - (observed - predicted)^2 * qw * ddp_weight / normalization + (observed - predicted)^2 * qw * ddp_weight * var_weight / normalization } # Helping function for returning a failure value @@ -299,13 +323,16 @@ error_from_res <- function( errors <- sapply(seq_len(n_obs), function(i) { qname <- as.character(long_form_data_table[i, 'quantity_name']) - obs <- long_form_data_table[i, 'quantity_value'] indx <- long_form_data_table[i, 'time_index'] - pred <- simulation_result[indx, qname] - qt_wt <- quantity_weights[[qname]] - norm <- long_form_data_table[i, 'norm'] - one_error(obs, pred, qt_wt, ddp_weight, norm) + one_error( + long_form_data_table[i, 'quantity_value'], # obs + simulation_result[indx, qname], # pred + quantity_weights[[qname]], # quantity_weight + ddp_weight, # ddp_weight + long_form_data_table[i, 'w_var'], # var_weight + long_form_data_table[i, 'norm'] # norm + ) }) error_sum <- sum(errors) diff --git a/R/objective_function_input_checks.R b/R/objective_function_input_checks.R index e346995..44fa7d9 100644 --- a/R/objective_function_input_checks.R +++ b/R/objective_function_input_checks.R @@ -5,6 +5,11 @@ # Helping function for checking the data-driver pairs; will throw an error if # a problem is detected, and will otherwise be silent with no return value. check_data_driver_pairs <- function(base_model_definition, data_driver_pairs) { + # There must be at least one data-driver pair + if (length(data_driver_pairs) < 1) { + stop('`data_driver_pairs` must have at least one element') + } + # Data-driver pairs must have names if (is.null(names(data_driver_pairs))) { stop('`data_driver_pairs` must have names') @@ -47,7 +52,8 @@ check_data_driver_pairs <- function(base_model_definition, data_driver_pairs) { 'The following data-driver pairs have unexpected elements: ', paste(bad_elements, collapse = ', '), '. The allowed elements are: ', - paste(acceptable_elements, collapse = ', ') + paste(acceptable_elements, collapse = ', '), + '.' ) stop(msg) @@ -74,21 +80,25 @@ check_data_driver_pairs <- function(base_model_definition, data_driver_pairs) { # their corresponding data tables. Time values must also be in the same # order. stdev_okay <- sapply(data_driver_pairs, function(x) { - if ('data_stdev' %in% colnames(x)) { + if ('data_stdev' %in% names(x)) { data_table <- x[['data']] stdev_table <- x[['data_stdev']] - colnames_match <- identical( - sort(colnames(data_table)), - sort(colnames(stdev_table)) - ) + if (is.null(colnames(stdev_table))) { + FALSE + } else { + colnames_match <- identical( + sort(colnames(data_table)), + sort(colnames(stdev_table)) + ) - times_match <- isTRUE(all.equal( - data_table[['time']], - stdev_table[['time']] - )) + times_match <- isTRUE(all.equal( + data_table[['time']], + stdev_table[['time']] + )) - colnames_match && times_match + colnames_match && times_match + } } else { TRUE } diff --git a/man/objective_function.Rd b/man/objective_function.Rd index d7c069e..259f885 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -32,6 +32,7 @@ quantity_weights, data_definitions = list(), normalization_method = 'mean_max', + variance_weight_method = 'equal', dependent_arg_function = NULL, post_process_function = NULL, extra_penalty_function = NULL, @@ -86,6 +87,11 @@ error metric; see below for more details. } + \item{variance_weight_method}{ + A string indicating the method to be used when calculating the + variance-based weights used in the error metric; see below for more details. + } + \item{dependent_arg_function}{ A function whose input argument is a named list of independent argument values, and which returns a named list of dependent argument values. If the @@ -280,8 +286,8 @@ The following normalization methods are available: \itemize{ - \item \code{'none'}: For this method, \eqn{N_i} is always set to 1. In other - words, no normalization is performed. + \item \code{'equal'}: For this method, \eqn{N_i} is always set to 1. In + other words, no normalization is performed. \item \code{'mean'}: For this method, when \eqn{Y_i} is named \code{vtype} and the observation is from a set called \code{vdata}, then @@ -509,6 +515,8 @@ if (require(BioCro)) { independent_args, quantity_weights, data_definitions = data_definitions, + variance_weight_method = 'logarithm', + dependent_arg_function = dependent_arg_function, post_process_function = post_process_function, extra_penalty_function = extra_penalty_function, verbose_startup = TRUE diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index 259676c..dfe2051 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -182,7 +182,19 @@ ambient_2005: Error in as.data.frame(.Call(R_run_biocro, initial_values, paramet ) }) -test_that('Data-driver pairs must be complete', { +test_that('Data-driver pairs must have correct elements', { + expect_error( + objective_function( + model, + list(), + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function + ), + '`data_driver_pairs` must have at least one element' + ) + expect_error( objective_function( model, @@ -195,6 +207,30 @@ test_that('Data-driver pairs must be complete', { 'The following data-driver pairs are missing at least one required element (drivers, data, weight): ambient_2002, ambient_2005', fixed = TRUE ) + + expect_error( + objective_function( + model, + within(ddps, {ambient_2002$extra_element = 5}), + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function + ), + 'The following data-driver pairs have unexpected elements: ambient_2002. The allowed elements are: drivers, data, weight, data_stdev.' + ) + + expect_error( + objective_function( + model, + within(ddps, {ambient_2002$data_stdev = 5}), + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function + ), + 'The following data-driver pairs have a `data_stdev` element that does not match the columns and/or times of their `data` element: ambient_2002' + ) }) test_that('Data must have a `time` column', { @@ -272,6 +308,21 @@ test_that('Bad normalization methods are detected', { ) }) +test_that('Bad variance methods are detected', { + expect_error( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + variance_weight_method = 'bad_variance_method', + data_definitions = data_definitions, + post_process_function = post_process_function + ), + 'Unsupported variance_weight_method: bad_variance_method' + ) +}) + test_that('Bad return values are detected', { expect_error( objective_function( From 65b51da672c3174a44803eefd7feb1b27e56e9ef Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 9 May 2025 13:32:07 -0500 Subject: [PATCH 31/54] Check data values and weights --- R/objective_function.R | 8 +++ R/objective_function_input_checks.R | 83 ++++++++++++++++++++++++ tests/testthat/test-objective_function.R | 55 +++++++++++++--- 3 files changed, 137 insertions(+), 9 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index e4ca11c..ec64971 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -41,6 +41,7 @@ objective_function <- function( full_data_definitions <- get_data_definition_list(data_driver_pairs, data_definitions) + # Print the full data definition list, if desired if (verbose_startup) { cat('\nThe full data definitions:\n\n') utils::str(full_data_definitions) @@ -80,15 +81,21 @@ objective_function <- function( variance_weight_method ) + # Print the long form data, if desired. Do this before checking the data, + # so the printout will be available for troubleshooting if (verbose_startup) { cat('\nThe user-supplied data in long form:\n\n') print(long_form_data) } + # Check the processed long-form data + check_long_form_data(long_form_data) + # Process the quantity weights full_quantity_weights <- process_quantity_weights(quantity_weights, long_form_data) + # Print the quantity weights, if desired if (verbose_startup) { cat('The user-supplied quantity weights:\n\n') utils::str(full_quantity_weights) @@ -97,6 +104,7 @@ objective_function <- function( # Get the data-driver pair weights ddp_weights <- get_ddp_weights(data_driver_pairs) + # Print the data-driver pair weights, if desired if (verbose_startup) { cat('\nThe user-supplied data-driver pair weights:\n\n') utils::str(ddp_weights) diff --git a/R/objective_function_input_checks.R b/R/objective_function_input_checks.R index 44fa7d9..0d4fa53 100644 --- a/R/objective_function_input_checks.R +++ b/R/objective_function_input_checks.R @@ -305,6 +305,89 @@ check_runner_results <- function( return(invisible(NULL)) } +# Helping function for checking the long-form data; will throw an error if a +# problem is detected, and will otherwise be silent with no return value. +check_long_form_data <- function(long_form_data) { + # Check each element for issues + messages <- sapply(long_form_data, function(lfd) { + msg <- character() + + # Check that certain columns have finite values + check_for_not_finite <- c( + 'time', + 'quantity_value', + 'quantity_stdev', + 'time_index', + 'expected_npts', + 'norm', + 'w_var' + ) + + not_finite <- sapply(check_for_not_finite, function(cn) { + any(!is.finite(lfd[, cn])) + }) + + if (any(not_finite)) { + not_finite_col <- check_for_not_finite[not_finite] + + new_msg <- paste( + 'The following columns contained non-finite values:', + paste(not_finite_col, collapse = ', ') + ) + + msg <- append(msg, new_msg) + } + + # Check that certain columns do not have negative values + check_for_negative <- c( + 'quantity_stdev', + 'time_index', + 'expected_npts', + 'norm', + 'w_var' + ) + + negative <- sapply(check_for_negative, function(cn) { + any(lfd[, cn] < 0) + }) + + if (any(negative)) { + negative_col <- check_for_negative[negative] + + new_msg <- paste( + 'The following columns contained negative values:', + paste(negative_col, collapse = ', ') + ) + + msg <- append(msg, new_msg) + } + + # Return any messages + paste(msg, collapse = '\n ') + }) + + # Send a message if problems were detected + error_found <- messages != '' + + if (any(error_found)) { + data_names <- names(long_form_data)[error_found] + error_messages <- messages[error_found] + + msg <- paste0( + 'Issues were found with the following data sets:\n ', + paste0( + data_names, ':\n ', + error_messages, + collapse = '\n ' + ) + ) + + stop(msg) + } + + return(invisible(NULL)) +} + # Helping function for checking the objective function; will throw an error if a # problem is detected, and will otherwise be silent with no return value. check_obj_fun <- function(obj_fun, initial_ind_arg_values, verbose) { diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index dfe2051..a4d61bb 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -2,16 +2,24 @@ model <- BioCro::soybean model$ode_solver <- BioCro::default_ode_solvers[['homemade_euler']] +convert_time <- function(x) { + within(x, { + time = (DOY - 1) * 24.0 + DOY = NULL + }) +} + ddps <- list( - ambient_2002 = list( - data = within(soyface_biomass[['ambient_2002']], {time = (DOY - 1) * 24.0; DOY = NULL}), - drivers = BioCro::soybean_weather[['2002']], - weight = 1 + ambient_2002 = list( + data = convert_time(soyface_biomass[['ambient_2002']]), + data_stdev = convert_time(soyface_biomass[['ambient_2002_std']]), + drivers = BioCro::soybean_weather[['2002']], + weight = 1 ), - ambient_2005 = list( - data = within(soyface_biomass[['ambient_2005']], {time = (DOY - 1) * 24.0; DOY = NULL}), - drivers = BioCro::soybean_weather[['2005']], - weight = 2 + ambient_2005 = list( + data = convert_time(soyface_biomass[['ambient_2005']]), + drivers = BioCro::soybean_weather[['2005']], + weight = 2 ) ) @@ -264,10 +272,15 @@ ambient_2005: Pod', }) test_that('Out-of-range times are detected', { + time_offset <- 1e5 + expect_error( objective_function( model, - within(ddps, {ambient_2002$data$time <- ambient_2002$data$time + 1e5}), + within(ddps, { + ambient_2002$data$time <- ambient_2002$data$time + time_offset + ambient_2002$data_stdev$time <- ambient_2002$data_stdev$time + time_offset + }), independent_args, quantity_weights, data_definitions = data_definitions, @@ -352,3 +365,27 @@ test_that('Bad regularization methods are detected', { 'Unsupported regularization method: bad_regularization_method' ) }) + +test_that('Bad data values and weights are detected', { + expect_error( + objective_function( + model, + within(ddps, { + ambient_2005$data_stdev = convert_time(soyface_biomass[['ambient_2005_std']]) + ambient_2005$data_stdev[['Leaf_Mg_per_ha']] <- -0.1 + }), + independent_args, + quantity_weights, + variance_weight_method = 'inverse', + data_definitions = data_definitions, + post_process_function = post_process_function + ), + 'Issues were found with the following data sets: + ambient_2002: + The following columns contained non-finite values: w_var + ambient_2005: + The following columns contained non-finite values: w_var + The following columns contained negative values: quantity_stdev', + fixed = TRUE + ) +}) From 1a0e1d176c06c1da7626c63af76480f17c84a3fc Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 9 May 2025 15:10:08 -0500 Subject: [PATCH 32/54] Change verbose defaults --- R/objective_function.R | 2 +- man/objective_function.Rd | 8 ++- tests/testthat/test-objective_function.R | 68 ++++++++++++++++-------- 3 files changed, 50 insertions(+), 28 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index ec64971..55d18ff 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -13,7 +13,7 @@ objective_function <- function( post_process_function = NULL, extra_penalty_function = NULL, regularization_method = 'none', - verbose_startup = FALSE + verbose_startup = TRUE ) { # Check the data-driver pairs diff --git a/man/objective_function.Rd b/man/objective_function.Rd index 259f885..36abb02 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -37,7 +37,7 @@ post_process_function = NULL, extra_penalty_function = NULL, regularization_method = 'none', - verbose_startup = FALSE + verbose_startup = TRUE ) } @@ -507,8 +507,7 @@ if (require(BioCro)) { } } - # Now we can finally create the objective function; here we choose to print - # extra startup information, but this is not strictly required. + # Now we can finally create the objective function obj_fun <- objective_function( base_model_definition, data_driver_pairs, @@ -518,8 +517,7 @@ if (require(BioCro)) { variance_weight_method = 'logarithm', dependent_arg_function = dependent_arg_function, post_process_function = post_process_function, - extra_penalty_function = extra_penalty_function, - verbose_startup = TRUE + extra_penalty_function = extra_penalty_function ) # This function could now be passed to an optimizer; here we will simply diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index a4d61bb..3c91c8d 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -47,6 +47,8 @@ quantity_weights <- list( Pod = 1 ) +verbose_startup <- FALSE + # Run tests test_that('Objective functions can be created and behave as expected', { # Two data-driver pairs, no dependent arguments @@ -57,7 +59,8 @@ test_that('Objective functions can be created and behave as expected', { independent_args, quantity_weights, data_definitions = data_definitions, - post_process_function = post_process_function + post_process_function = post_process_function, + verbose_startup = verbose_startup ) ) @@ -73,7 +76,8 @@ test_that('Objective functions can be created and behave as expected', { independent_args, quantity_weights, data_definitions = data_definitions, - post_process_function = post_process_function + post_process_function = post_process_function, + verbose_startup = verbose_startup ) ) @@ -87,7 +91,8 @@ test_that('Objective functions can be created and behave as expected', { data_definitions = data_definitions, dependent_arg_function = dependent_arg_function, post_process_function = post_process_function, - regularization_method = 'L2' + regularization_method = 'L2', + verbose_startup = verbose_startup ) ) @@ -110,7 +115,8 @@ test_that('Bad definitions are detected', { independent_args, quantity_weights, data_definitions = data_definitions, - post_process_function = post_process_function + post_process_function = post_process_function, + verbose_startup = verbose_startup ), 'The following drivers did not form a valid dynamical system: ambient_2005' ) @@ -124,7 +130,8 @@ test_that('Independent and dependent arguments must have names', { as.numeric(independent_args), quantity_weights, data_definitions = data_definitions, - post_process_function = post_process_function + post_process_function = post_process_function, + verbose_startup = verbose_startup ), '`independent_args` must have names' ) @@ -137,7 +144,8 @@ test_that('Independent and dependent arguments must have names', { quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function, - dependent_arg_function = function(x) {1.0} + dependent_arg_function = function(x) {1.0}, + verbose_startup = verbose_startup ), 'The return value of `dependent_arg_function` must have names' ) @@ -150,7 +158,8 @@ test_that('Independent and dependent arguments must have names', { quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function, - dependent_arg_function = function(x) {list(precip = 0.1)} + dependent_arg_function = function(x) {list(precip = 0.1)}, + verbose_startup = verbose_startup ), 'Some independent or dependent argument names refer to columns in the drivers: solar, precip' ) @@ -164,7 +173,8 @@ test_that('Bad argument names are detected', { c(independent_args, list(bad_arg_name = 1)), quantity_weights, data_definitions = data_definitions, - post_process_function = post_process_function + post_process_function = post_process_function, + verbose_startup = verbose_startup ), 'Model runners could not be created for the following drivers: ambient_2002: Error: `bad_arg_name` from `arg_names` is not in the `initial_values`, `parameters`, or `drivers` @@ -181,7 +191,8 @@ test_that('Model failures are detected', { independent_args, quantity_weights, data_definitions = data_definitions, - post_process_function = post_process_function + post_process_function = post_process_function, + verbose_startup = verbose_startup ), 'The model could not be run with the following drivers: ambient_2002: Error in as.data.frame(.Call(R_run_biocro, initial_values, parameters, : Caught exception in R_run_biocro: Thrown by the multilayer_canopy_properties module: lnfun != 0 is not yet supported. @@ -198,7 +209,8 @@ test_that('Data-driver pairs must have correct elements', { independent_args, quantity_weights, data_definitions = data_definitions, - post_process_function = post_process_function + post_process_function = post_process_function, + verbose_startup = verbose_startup ), '`data_driver_pairs` must have at least one element' ) @@ -210,7 +222,8 @@ test_that('Data-driver pairs must have correct elements', { independent_args, quantity_weights, data_definitions = data_definitions, - post_process_function = post_process_function + post_process_function = post_process_function, + verbose_startup = verbose_startup ), 'The following data-driver pairs are missing at least one required element (drivers, data, weight): ambient_2002, ambient_2005', fixed = TRUE @@ -223,7 +236,8 @@ test_that('Data-driver pairs must have correct elements', { independent_args, quantity_weights, data_definitions = data_definitions, - post_process_function = post_process_function + post_process_function = post_process_function, + verbose_startup = verbose_startup ), 'The following data-driver pairs have unexpected elements: ambient_2002. The allowed elements are: drivers, data, weight, data_stdev.' ) @@ -235,7 +249,8 @@ test_that('Data-driver pairs must have correct elements', { independent_args, quantity_weights, data_definitions = data_definitions, - post_process_function = post_process_function + post_process_function = post_process_function, + verbose_startup = verbose_startup ), 'The following data-driver pairs have a `data_stdev` element that does not match the columns and/or times of their `data` element: ambient_2002' ) @@ -249,7 +264,8 @@ test_that('Data must have a `time` column', { independent_args, quantity_weights, data_definitions = data_definitions, - post_process_function = post_process_function + post_process_function = post_process_function, + verbose_startup = verbose_startup ), 'The following data-driver pairs are missing a `time` column in their `data` element: ambient_2002' ) @@ -262,7 +278,8 @@ test_that('Missing simulation outputs are detected', { ddps, independent_args, quantity_weights, - data_definitions = data_definitions + data_definitions = data_definitions, + verbose_startup = verbose_startup ), 'Some data columns were missing from runner outputs: ambient_2002: Pod @@ -284,7 +301,8 @@ test_that('Out-of-range times are detected', { independent_args, quantity_weights, data_definitions = data_definitions, - post_process_function = post_process_function + post_process_function = post_process_function, + verbose_startup = verbose_startup ), 'Some observed times were missing from runner outputs: ambient_2002: 104272, 104512, 104848, 105184, 105520, 105880, 106192, 106888', @@ -300,7 +318,8 @@ test_that('Weights must be supplied for all measured quantities', { independent_args, list(), data_definitions = data_definitions, - post_process_function = post_process_function + post_process_function = post_process_function, + verbose_startup = verbose_startup ), 'Weights were not supplied for the following measured quantities: Leaf, Stem, Pod' ) @@ -315,7 +334,8 @@ test_that('Bad normalization methods are detected', { quantity_weights, normalization_method = 'bad_normalization_method', data_definitions = data_definitions, - post_process_function = post_process_function + post_process_function = post_process_function, + verbose_startup = verbose_startup ), 'Unsupported normalization_method: bad_normalization_method' ) @@ -330,7 +350,8 @@ test_that('Bad variance methods are detected', { quantity_weights, variance_weight_method = 'bad_variance_method', data_definitions = data_definitions, - post_process_function = post_process_function + post_process_function = post_process_function, + verbose_startup = verbose_startup ), 'Unsupported variance_weight_method: bad_variance_method' ) @@ -345,7 +366,8 @@ test_that('Bad return values are detected', { quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function, - extra_penalty_function = function(x) {NA} + extra_penalty_function = function(x) {NA}, + verbose_startup = verbose_startup ), 'The objective function did not return a finite value when using the initial argument values; instead, it returned: NA' ) @@ -360,7 +382,8 @@ test_that('Bad regularization methods are detected', { quantity_weights, data_definitions = data_definitions, post_process_function = post_process_function, - regularization_method = 'bad_regularization_method' + regularization_method = 'bad_regularization_method', + verbose_startup = verbose_startup ), 'Unsupported regularization method: bad_regularization_method' ) @@ -378,7 +401,8 @@ test_that('Bad data values and weights are detected', { quantity_weights, variance_weight_method = 'inverse', data_definitions = data_definitions, - post_process_function = post_process_function + post_process_function = post_process_function, + verbose_startup = verbose_startup ), 'Issues were found with the following data sets: ambient_2002: From 6fbc6ed8ab06f918ceb42210dc07d73bbbf72611 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 9 May 2025 16:12:20 -0500 Subject: [PATCH 33/54] Document several commits --- R/objective_function.R | 6 +- R/objective_function_helpers.R | 10 +- man/objective_function.Rd | 166 +++++++++++++++++------ tests/testthat/test-objective_function.R | 6 +- 4 files changed, 132 insertions(+), 56 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index 55d18ff..d5dd158 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -8,11 +8,11 @@ objective_function <- function( quantity_weights, data_definitions = list(), normalization_method = 'mean_max', - variance_weight_method = 'equal', + stdev_weight_method = 'equal', + regularization_method = 'none', dependent_arg_function = NULL, post_process_function = NULL, extra_penalty_function = NULL, - regularization_method = 'none', verbose_startup = TRUE ) { @@ -78,7 +78,7 @@ objective_function <- function( # Add variance-based weights long_form_data <- add_w_var( long_form_data, - variance_weight_method + stdev_weight_method ) # Print the long form data, if desired. Do this before checking the data, diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R index a9da812..ef395c6 100644 --- a/R/objective_function_helpers.R +++ b/R/objective_function_helpers.R @@ -198,20 +198,20 @@ add_norm <- function(long_form_data, normalization_method, n_ddp) { } # Helping function for getting variance-based weights -add_w_var <- function(long_form_data, variance_weight_method) { +add_w_var <- function(long_form_data, stdev_weight_method) { for (i in seq_along(long_form_data)) { data_table <- long_form_data[[i]] data_stdev <- data_table[['quantity_stdev']] data_table[['w_var']] <- - if (tolower(variance_weight_method) == 'equal') { + if (tolower(stdev_weight_method) == 'equal') { 1.0 - } else if (tolower(variance_weight_method) == 'logarithm') { + } else if (tolower(stdev_weight_method) == 'logarithm') { log(1 / (data_stdev + 1e-5)) - } else if (tolower(variance_weight_method) == 'inverse') { + } else if (tolower(stdev_weight_method) == 'inverse') { 1 / data_stdev^2 } else { - stop('Unsupported variance_weight_method: ', variance_weight_method) + stop('Unsupported stdev_weight_method: ', stdev_weight_method) } long_form_data[[i]] <- data_table diff --git a/man/objective_function.Rd b/man/objective_function.Rd index 36abb02..5d93d39 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -32,11 +32,11 @@ quantity_weights, data_definitions = list(), normalization_method = 'mean_max', - variance_weight_method = 'equal', + stdev_weight_method = 'equal', + regularization_method = 'none', dependent_arg_function = NULL, post_process_function = NULL, extra_penalty_function = NULL, - regularization_method = 'none', verbose_startup = TRUE ) } @@ -49,13 +49,27 @@ \item{data_driver_pairs}{ A list of named elements, where each element is a "data-driver pair." A - data-driver pair is a list with two elements, \code{data} and - \code{drivers}. The \code{data} element must be a data frame with one column - named \code{time}, whose values follow BioCro's definition of + data-driver pair is a list with three required elements: \code{data}, + \code{drivers}, and \code{weight}. Optionally, it may also have a + \code{data_stdev} element. + + The \code{data} element must be a data frame with one column named + \code{time}, whose values follow BioCro's definition of \code{\link[BioCro]{time}}; the other columns should represent observed - values of model outputs. The \code{drivers} element must be a data frame - that can be passed to \code{\link[BioCro]{run_biocro}} as its \code{drivers} - input argument. + values of model outputs. Any \code{NA} values in \code{data} will + be ignored when calculating the error metric. + + The \code{drivers} element must be a data frame that can be passed to + \code{\link[BioCro]{run_biocro}} as its \code{drivers} input argument. + + The \code{weight} element must be a single numeric value indicating a weight + to be used when calculating the error metric. + + The optional \code{data_stdev} element must be a data frame with the same + column names as \code{data}, and the same time values; its other entries + should represent the standard deviation associated with each entry in + \code{data}. If \code{data_stdev} is not supplied, all standard deviations + will be set to 1. } \item{independent_args}{ @@ -87,11 +101,16 @@ error metric; see below for more details. } - \item{variance_weight_method}{ + \item{stdev_weight_method}{ A string indicating the method to be used when calculating the variance-based weights used in the error metric; see below for more details. } + \item{regularization_method}{ + A string indicating the regularization method to be used when calculating + the regularization penalty term; see below for more details. + } + \item{dependent_arg_function}{ A function whose input argument is a named list of independent argument values, and which returns a named list of dependent argument values. If the @@ -115,11 +134,6 @@ added. } - \item{regularization_method}{ - A string indicating the regularization method to be used when calculating - the regularization penalty term; see below for more details. - } - \item{verbose_startup}{ A logical (\code{TRUE} or \code{FALSE}) value indicating whether to print additional information to the R terminal when creating the objective @@ -163,21 +177,22 @@ that produce close agreement with the observed data, but are nevertheless not biologically resonable. For example, it may produce a sharp peak at a high leaf mass in between two measured points, when - in reality, leaf mass should vary monotonically between them. In this + in reality, leaf mass should be nearly constant between them. In this case, it may be necessarily to add an extra penalty to the objective function that prevents the optimizer from choosing such values. Here, this is handled through the \code{extra_penalty_function} input. - \item \strong{Flexible weights:} Typically a user would like to specify a - weight for each variable being considered in the error metric, either - to represent an uncertainty or to emphasize agreement with one output + \item \strong{Flexible weights:} Often a user would like to specify a weight + for each variable being considered in the error metric, either to + represent an uncertainty or to emphasize agreement with one output at the expense of another. For example, the seed mass may need a high weight to prioritize accurate yield predictions. Further, these weights may need to differ for underestimates as compared to overestimates; for example, measured root mass is often lower than the true root mass, so a user may wish to penalize underestimates of root - mass more severely than overestimates. Here, this is handled through - the \code{quantity_weights} input. + mass more severely than overestimates. Here, these situations are + handled through the \code{quantity_weights} and + \code{stdev_weight_method} inputs. \item \strong{Dependent parameters:} Sometimes one model parameter must be determined from one or more other parameters; for example, a user may @@ -217,13 +232,15 @@ \item \strong{Agreement term:} The agreement term \eqn{E_{agreement}} is calculated using a least-squares approach. In other words, - \deqn{E_{agreement} =% - \sum_i \frac{\left(Y_{obs}^i - Y_{mod}^i \right)^2}{N_i} \cdot w_i,} - where the sum runs over all \eqn{n} observations, \eqn{Y_{obs}^i} and - \eqn{Y_{mod}^i} are observed and modeled values of variable \eqn{Y_i}, - \eqn{N_i} is a normalization factor that depends on the name of - \eqn{Y_i} and the data set that includes the \eqn{i^{th}} observation, - and \eqn{w_i} is a weight factor that depends on the name of \eqn{Y}. + \deqn{E_{agreement} = \sum_i \left(Y_{obs}^i - Y_{mod}^i \right)^2% + \cdot \frac{w_i^{quantity} w_i^{data} w_i^{stdev}}{N_i},} + where the sum runs over all \eqn{n} observations; \eqn{Y_{obs}^i} and + \eqn{Y_{mod}^i} are observed and modeled values of variable \eqn{Y_i}; + \eqn{w_i^{quantity}}, \eqn{w_i^{data}}, and \eqn{w_i^{stdev}} are + weight factors that depend on the name of \eqn{Y_i}, the data set that + includes the \eqn{i^{th}} observation, and the standard deviation + associated with \eqn{Y_{obs}^i}, respectively; and \eqn{N_i} is a + normalization factor. Each value of \eqn{Y_{obs}^i} is specified at a particular time \eqn{t_i}. The corresponding modeled value, \eqn{Y_{mod}^i}, is found @@ -233,21 +250,33 @@ is run with a particular set of drivers, regardless of the input argument values. + The quantity-based weight factors \eqn{w_i^{quantity}} are directly + specified by the user via the \code{quantity_weights} input. For + example, if \code{quantity_weights} has an element named \code{Leaf} + that is equal to 0.5, then \eqn{w_i} will be equal to 0.5 whenever + \eqn{Y_i} represents a leaf mass value, regardless of which set of + drivers or time point corresponds to \eqn{Y_i}. The weights can also + be supplied as \eqn{(w_{under}, w_{over})} pairs instead of single + values; in this case, the value of \eqn{w_i} depends on whether the + model makes an underprediction or an overprediction: + \eqn{w_i = w_{under}} when \eqn{Y_{mod}^i < Y_{obs}^i} and + \eqn{w_i = w_{over}} otherwise. + + The data-set-based weight factors \eqn{w_i^{data}} are directly + specified by the user via the \code{weight} element of each + data-driver pair. For example, if the second element of + \code{data_driver_pairs} has a \code{weight} of 2.0, then + \eqn{w_i^{data}} will be equal to 2.0 for all observations from the + corresponding data set. + + The standard-deviation-based weight factors \eqn{w_i^{stdev}} are + determined by the choice of \code{stdev_weight_method}; the available + methods are discussed below. + The normalization factors \eqn{N_i} are determined by the choice of \code{normalization_method}; the available methods are discussed below. - The weight factors \eqn{w_i} are directly specified by the user via - the \code{quantity_weights} input. For example, if - \code{quantity_weights} has an element named \code{Leaf} that is equal - to 0.5, then \eqn{w_i} will be equal to 0.5 whenever \eqn{Y_i} - represents a leaf mass value, regardless of which set of drivers or - time point corresponds to \eqn{Y_i}. The weights can also be supplied - as \eqn{(w_{under}, w_{over})} pairs instead of single values; in this - case, the value of \eqn{w_i} depends on whether the model makes an - underprediction or an overprediction: \eqn{w_i = w_{under}} when - \eqn{Y_{mod}^i < Y_{obs}^i} and \eqn{w_i = w_{over}} otherwise. - There are a few special cases where \eqn{E_{agreement}} is set to a very high value (\code{BioCroValidation:::FAILURE_VALUE}). This is done when a simulation fails to run, when the \eqn{E_{agreement}} term @@ -281,6 +310,40 @@ below. } + \strong{Standard-deviation-based weight methods} + + The following methods are available for determining weight factors from values + of the standard deviation (\eqn{\sigma}), which can be (optionally) supplied + via the \code{data_stdev} elements of the \code{data_driver_pairs}: + + \itemize{ + \item \code{'equal'}: For this method, \eqn{w_i^{stdev}} is always set to + 1. In other words, all variances are treated as being equal, + regardless of any user-supplied values. This is usually the best + choice when values of \eqn{\sigma} are unavailable or cannot be + estimated. + + \item \code{'logarithm'}: For this method, \eqn{w_i^{stdev}} is calculated + as + + \deqn{w_i^{stdev} =% + ln \left( \frac{1}{\sigma + 10^{-5}} \right),} + where \eqn{ln} denotes a logarithm with base \eqn{e}. This method + was used in the original Soybean-BioCro paper. Note: this method + should be used with caution, because \eqn{w_i^{stdev}} is zero + for \eqn{\sigma = 1 - 10^{-5} = 0.99999}, and it becomes negative for + larger values of \eqn{\sigma}. + + \item \code{'inverse'}: For this method, \eqn{w_i^{stdev}} is calculated as + + \deqn{w_i^{stdev} = \frac{1}{\sigma^2}.} + Note: this method should be used with caution, because + \eqn{w_i^{stdev}} is infinite when \eqn{\sigma} is zero. + } + + If any values of \eqn{w_i^{stdev}} are undefined, negative, or infinite, an + error message will occur (see the "Input checks" section below). + \strong{Normalization methods} The following normalization methods are available: @@ -292,13 +355,16 @@ \item \code{'mean'}: For this method, when \eqn{Y_i} is named \code{vtype} and the observation is from a set called \code{vdata}, then - \deqn{N_i = n_{vtype}^{vdata},} + \deqn{N_i = n_{vtype}^{vdata} \cdot n_{data},} where \eqn{n_{vtype}^{vdata}} is the number of observations of type - \code{vtype} that are included in \code{vdata}. In this case, the - error term \eqn{E_{agreement}} becomes a mean error for each set of + \code{vtype} that are included in \code{vdata} and \eqn{n_{data}} is + the total number of data-driver pairs. In this case, the error term + \eqn{E_{agreement}} becomes a mean error across the full set of drivers, hence the name for this method. This approach avoids over-representing drivers with larger numbers of associated - observations when determining \eqn{E_{agreement}}. + observations when determining \eqn{E_{agreement}}. It also preserves + the overall magnitude of \eqn{E_{agreement}} when data-driver pairs + are added. \item \code{'max'}: For this method, when \eqn{Y_i} is named \code{vtype} and the observation is from a set called \code{vdata}, then @@ -314,8 +380,8 @@ \item \code{'mean_max'}: For this method, the "mean" and "max" methods are combined so that - \deqn{N_i =% - n_{vtype}^{vdrivers} \cdot \left( max_{vtype}^{vdata} \right)^2.} + \deqn{N_i = n_{vtype}^{vdrivers}% + \cdot n_{data} \cdot \left( max_{vtype}^{vdata} \right)^2.} This approach avoids over-representing drivers with larger numbers of associated observations, and variable types with larger magnitudes. This method is used for parameterizing Soybean-BioCro, and is @@ -381,6 +447,11 @@ \item Ensuring that the optional \code{dependent_arg_function}, \code{post_process_function}, and \code{extra_penalty_function} functions can be run without causing errors. + + \item Ensuring that certain values are finite (such as \eqn{Y_{obs}}, + \eqn{\sigma}, \eqn{w_i^{stdev}}, and \eqn{N}), and that certain values + are not negative (such as \eqn{\sigma}, \eqn{w_i^{stdev}}, and + \eqn{N}). } If any issues are detected, an informative error message will be sent. Note @@ -388,6 +459,11 @@ drivers. For these checks, the argument values specified by \code{independent_args} will be used, so they should be valid or otherwise reasonable values. + + If an error message occurs when \code{verbose_startup} was set to + \code{FALSE}, it is recommended to call this function again with + \code{verbose_startup} set to \code{TRUE}, since the additional output can be + helpful for troubleshooting. } \value{ @@ -514,7 +590,7 @@ if (require(BioCro)) { independent_args, quantity_weights, data_definitions = data_definitions, - variance_weight_method = 'logarithm', + stdev_weight_method = 'logarithm', dependent_arg_function = dependent_arg_function, post_process_function = post_process_function, extra_penalty_function = extra_penalty_function diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index 3c91c8d..61e2b62 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -348,12 +348,12 @@ test_that('Bad variance methods are detected', { ddps, independent_args, quantity_weights, - variance_weight_method = 'bad_variance_method', + stdev_weight_method = 'bad_stdev_method', data_definitions = data_definitions, post_process_function = post_process_function, verbose_startup = verbose_startup ), - 'Unsupported variance_weight_method: bad_variance_method' + 'Unsupported stdev_weight_method: bad_stdev_method' ) }) @@ -399,7 +399,7 @@ test_that('Bad data values and weights are detected', { }), independent_args, quantity_weights, - variance_weight_method = 'inverse', + stdev_weight_method = 'inverse', data_definitions = data_definitions, post_process_function = post_process_function, verbose_startup = verbose_startup From 64402cbdd68690640554942b1a23564e7b0f716e Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 9 May 2025 16:20:09 -0500 Subject: [PATCH 34/54] Allow and test R v3.6.0 And move BioCro to Imports --- .github/workflows/R-CMD-check.yaml | 2 +- DESCRIPTION | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 94b56af..acfefe6 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -25,7 +25,7 @@ jobs: - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - {os: ubuntu-latest, r: 'release'} - {os: ubuntu-latest, r: 'oldrel-1'} - - {os: ubuntu-latest, r: '4.0.0'} + - {os: ubuntu-latest, r: '3.6.0'} env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} diff --git a/DESCRIPTION b/DESCRIPTION index ef57a21..2d6cb3d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -12,9 +12,10 @@ License: MIT + file LICENSE Encoding: UTF-8 LazyData: true Depends: - R (>= 4.0.0) -Suggests: - testthat (>= 3.0.0), + R (>= 3.6.0) +Imports: BioCro (>= 3.0.0) +Suggests: + testthat (>= 3.0.0) URL: https://github.com/BioCro/BioCroField, https://biocro.github.io/BioCroValidation/ Config/testthat/edition: 3 From c54ef405aa6f1cc2786980180fa2f41ca0d5c053 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 9 May 2025 16:26:48 -0500 Subject: [PATCH 35/54] Check for multiple "closest" times --- R/objective_function_helpers.R | 4 +++- tests/testthat/test-objective_function.R | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R index ef395c6..13770ec 100644 --- a/R/objective_function_helpers.R +++ b/R/objective_function_helpers.R @@ -155,7 +155,9 @@ add_time_indices <- function(initial_runner_res, long_form_data) { indices <- sapply(dataf[, 'time'], function(x) { tdiff <- abs(res[, 'time'] - x) - which(tdiff == min(tdiff)) + + # Take only the first match, in case there are more + which(tdiff == min(tdiff))[1] }) long_form_data[[i]][, 'time_index'] <- indices diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index 61e2b62..3551261 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -310,6 +310,27 @@ ambient_2002: 104272, 104512, 104848, 105184, 105520, 105880, 106192, 106888', ) }) +test_that('Multiple time matches are handled', { + # The drivers have a time step of 1, so if we specify half-integer times, + # there will actually be two "closest" points to each observed time. + time_offset <- 0.5 + + expect_silent( + objective_function( + model, + within(ddps, { + ambient_2002$data$time <- ambient_2002$data$time + time_offset + ambient_2002$data_stdev$time <- ambient_2002$data_stdev$time + time_offset + }), + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ) + ) +}) + test_that('Weights must be supplied for all measured quantities', { expect_error( objective_function( From e5aa56e9081de0a5c0ae27999309b89946298f6a Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 9 May 2025 16:56:02 -0500 Subject: [PATCH 36/54] Include more info in time range error message --- R/objective_function_input_checks.R | 14 +++++++++++--- tests/testthat/test-objective_function.R | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/R/objective_function_input_checks.R b/R/objective_function_input_checks.R index 0d4fa53..8df1f63 100644 --- a/R/objective_function_input_checks.R +++ b/R/objective_function_input_checks.R @@ -277,11 +277,11 @@ check_runner_results <- function( datat < min_time || datat > max_time }) - data_times[oor] + c(min_time, max_time, data_times[oor]) }) bad_times <- sapply(times_out_of_range, function(x) { - length(x) > 0 + length(x) > 2 }) if (any(bad_times)) { @@ -289,11 +289,19 @@ check_runner_results <- function( for (i in seq_along(bad_times)) { if (bad_times[i]) { + raw_times_oor <- times_out_of_range[[i]] + + min_time <- raw_times_oor[1] + max_time <- raw_times_oor[2] + time_oor <- raw_times_oor[seq(3, length(raw_times_oor))] + msg <- append( msg, paste0( names(initial_runner_res)[i], ': ', - paste(times_out_of_range[[i]], collapse = ', ') + paste(time_oor, collapse = ', '), + ' (min_time = ', min_time, + ', max_time = ', max_time, ')' ) ) } diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index 3551261..29202bb 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -305,7 +305,7 @@ test_that('Out-of-range times are detected', { verbose_startup = verbose_startup ), 'Some observed times were missing from runner outputs: -ambient_2002: 104272, 104512, 104848, 105184, 105520, 105880, 106192, 106888', +ambient_2002: 104272, 104512, 104848, 105184, 105520, 105880, 106192, 106888 (min_time = 3624, max_time = 6911)', fixed = TRUE ) }) From 43df53666ed7e735ea694d956ee67118dbd49776 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 9 May 2025 17:08:26 -0500 Subject: [PATCH 37/54] Require newer BioCro Some examples fail for earlier versions due to `time` differences --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2d6cb3d..3a85ad4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -14,7 +14,7 @@ LazyData: true Depends: R (>= 3.6.0) Imports: - BioCro (>= 3.0.0) + BioCro (>= 3.2.0) Suggests: testthat (>= 3.0.0) URL: https://github.com/BioCro/BioCroField, https://biocro.github.io/BioCroValidation/ From b6a5889d4d54fd8220c6ccd83c02778260b6772f Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Mon, 12 May 2025 14:26:22 -0500 Subject: [PATCH 38/54] Include full biomass data sets --- data/soyface_biomass.R | 84 +++++++++++------------- man/objective_function.Rd | 26 +++++--- man/soyface_biomass.Rd | 19 +++++- tests/testthat/test-objective_function.R | 19 +++--- 4 files changed, 85 insertions(+), 63 deletions(-) diff --git a/data/soyface_biomass.R b/data/soyface_biomass.R index bb4e1b9..dc36357 100644 --- a/data/soyface_biomass.R +++ b/data/soyface_biomass.R @@ -1,58 +1,54 @@ soyface_biomass <- list( - ambient_2002 = utils::read.table( + ambient_2002 = utils::read.csv( textConnection(' - DOY Leaf_Mg_per_ha Stem_Mg_per_ha Pod_Mg_per_ha - 179 0.180284339 0.085244969 0 - 189 0.554461942 0.418853893 0 - 203 1.326552931 1.711067366 0.000317148 - 217 1.697944007 2.892825897 0.079396325 - 231 1.807742782 3.68591426 1.554571303 - 246 1.578813648 3.745260717 3.97601356 - 259 0.947537773 3.618401575 6.544650656 - 288 0 2.305701225 7.008967628 - '), + DOY,Leaf_Mg_per_ha,Stem_Mg_per_ha,Rep_Mg_per_ha,Seed_Mg_per_ha,Litter_Mg_per_ha,CumLitter_Mg_per_ha + 179,0.180284339443109,0.085244969371991,0,0,0,0 + 189,0.55446194221275,0.41885389322975,0,0,0,0 + 203,1.32655293077725,1.71106736644195,0.0003171478564925,0,0,0 + 217,1.69794400686295,2.8928258965309,0.07939632545295,0,0,0 + 231,1.80774278200725,3.6859142604218,1.55457130346238,0,0.3818897637489,0.3818897637489 + 246,1.57881364816734,3.74526071710995,3.97601356048602,2.480314960431,0.281423009601227,0.663312773350127 + 259,0.947537773327332,3.61840157451295,6.5446506556431,4.846894137844,0.315325021846977,0.978637795197104 + 288,0,2.30570122466198,7.00896762848425,5.4133858263375,1.60453886688794,2.58317666208504'), header = TRUE ), - ambient_2005 = utils::read.table( + ambient_2005 = utils::read.csv( textConnection(' - DOY Leaf_Mg_per_ha Stem_Mg_per_ha Pod_Mg_per_ha - 172 0.222271875 0.188803125 0 - 186 0.8460375 0.85220625 0 - 200 1.184465625 1.61896875 0 - 214 2.218059375 4.043615625 0.29925 - 228 2.147446875 4.477725 2.304553125 - 242 1.51948125 3.8920875 5.532778125 - 256 0.06575625 2.89905 5.371078125 - 270 0 2.1756 6.372253125 - '), + DOY,Leaf_Mg_per_ha,Stem_Mg_per_ha,Rep_Mg_per_ha,Seed_Mg_per_ha,Litter_Mg_per_ha,CumLitter_Mg_per_ha + 172,0.222271875,0.188803125,0,0,0,0 + 186,0.8460375,0.85220625,0,0,0,0 + 200,1.184465625,1.61896875,0,0,0,0 + 214,2.218059375,4.043615625,0.29925,0,0.041475,0.06654375 + 228,2.147446875,4.477725,2.304553125,0,0.153628125,0.18230625 + 242,1.51948125,3.8920875,5.532778125,3.022490625,0.1157625,0.335934375 + 256,0.06575625,2.89905,5.371078125,3.998203125,0.5310375,0.866971875 + 270,0,2.1756,6.372253125,4.965646875,0.281465625,1.1484375'), header = TRUE ), - ambient_2002_std = utils::read.table( + ambient_2002_std = utils::read.csv( textConnection(' - DOY Leaf_Mg_per_ha Stem_Mg_per_ha Pod_Mg_per_ha - 179 0.04081555 0.017079737 0 - 189 0.163863274 0.138449025 0 - 203 0.133774434 0.183710759 0.000549316 - 217 0.228326658 0.448744065 0.030989998 - 231 0.202475421 0.453447471 0.218402544 - 246 0.075475165 0.275321356 0.573548828 - 259 0.344550033 0.151045378 0.743440701 - 288 0 0.148389261 0.141828617 - '), + DOY,Leaf_Mg_per_ha,Stem_Mg_per_ha,Rep_Mg_per_ha,Seed_Mg_per_ha,Litter_Mg_per_ha,CumLitter_Mg_per_ha + 179,0.040815550055146,0.0170797372473185,0,0,0,0 + 189,0.163863273875505,0.138449024803329,0,0,0,0 + 203,0.133774433506304,0.18371075937777,0.000549316200956573,0,0,0 + 217,0.228326657644473,0.448744065190288,0.0309899984999309,0,0,0 + 231,0.202475421469591,0.453447470747275,0.218402543503905,0,0.0456182274998721,0.0456182274998721 + 246,0.0754751653831798,0.275321356095752,0.573548828062959,0.36605836251352,0.0724761221542088,0.0676627659537074 + 259,0.344550032518756,0.151045377683595,0.743440701063986,0.514421560164731,0.0495275235053204,0.0320523860205109 + 288,0,0.148389260861146,0.141828616948454,0.115260823492553,0.275546039297454,0.267829516105734'), header = TRUE ), - ambient_2005_std = utils::read.table( + ambient_2005_std = utils::read.csv( textConnection(' - DOY Leaf_Mg_per_ha Stem_Mg_per_ha Pod_Mg_per_ha - 172 0.032896589 0.014318136 0 - 186 0.146798299 0.198830061 0 - 200 0.338074288 0.605286253 0 - 214 0.152175913 0.559874052 0.164275197 - 228 0.119077589 0.306744644 0.434148074 - 242 0.512808699 0.379108485 0.588476976 - 256 0.061686243 0.220823981 0.52004438 - 270 0 0.243254729 0.633090858 - '), + DOY,Leaf_Mg_per_ha,Stem_Mg_per_ha,Rep_Mg_per_ha,Seed_Mg_per_ha,Litter_Mg_per_ha,CumLitter_Mg_per_ha + 172,0.0328965891267146,0.0143181358308921,0,0,0,0 + 186,0.146798299121618,0.19883006141002,0,0,0,0 + 200,0.338074287830052,0.605286252689843,0,0,0,0 + 214,0.152175913412888,0.559874052059856,0.16427519712551,0,0.0496298580745502,0.0637084579721366 + 228,0.119077588733867,0.30674464428393,0.434148073671929,0,0.0171152969203539,0.056246865537668 + 242,0.512808698911872,0.379108485359897,0.588476975732098,0.341714781741362,0.0100661134816274,0.073342888722574 + 256,0.0616862433382638,0.220823980737487,0.520044379820051,0.398956753155868,0.179630286326179,0.214176625850131 + 270,0,0.243254729045758,0.633090857876002,0.507224090206239,0.0520489826201903,0.246267463689126'), header = TRUE ) ) diff --git a/man/objective_function.Rd b/man/objective_function.Rd index 5d93d39..93c6dfe 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -505,11 +505,19 @@ if (require(BioCro)) { # in two data tables; it also includes the standard deviations of the measured # biomasses, which are included in two separate tables. However, these data # tables each have a `DOY` column rather than a `time` column, so we need to - # alter them. Here we will do this by defining a short helper function. - convert_time <- function(x) { + # alter them. The tables also include other columns we do not wish to use in + # this example. So, we will define a short helper function that can be used to + # pre-process each table. + process_table <- function(x) { within(x, { - time = (DOY - 1) * 24.0 # Define new `time` column - DOY = NULL # Remove unneeded `DOY` column + # Define new `time` column + time = (DOY - 1) * 24.0 + + # Remove unneeded columns + DOY = NULL + Seed_Mg_per_ha = NULL + Litter_Mg_per_ha = NULL + CumLitter_Mg_per_ha = NULL }) } @@ -518,14 +526,14 @@ if (require(BioCro)) { # heavily as the 2002 data. data_driver_pairs <- list( ambient_2002 = list( - data = convert_time(soyface_biomass[['ambient_2002']]), - data_stdev = convert_time(soyface_biomass[['ambient_2002_std']]), + data = process_table(soyface_biomass[['ambient_2002']]), + data_stdev = process_table(soyface_biomass[['ambient_2002_std']]), drivers = BioCro::soybean_weather[['2002']], weight = 1 ), ambient_2005 = list( - data = convert_time(soyface_biomass[['ambient_2005']]), - data_stdev = convert_time(soyface_biomass[['ambient_2005_std']]), + data = process_table(soyface_biomass[['ambient_2005']]), + data_stdev = process_table(soyface_biomass[['ambient_2005_std']]), drivers = BioCro::soybean_weather[['2005']], weight = 2 ) @@ -538,7 +546,7 @@ if (require(BioCro)) { data_definitions <- list( Leaf_Mg_per_ha = 'Leaf', Stem_Mg_per_ha = 'Stem', - Pod_Mg_per_ha = 'Pod' + Rep_Mg_per_ha = 'Pod' ) # The data contains values of pod mass, but the model does not calculate pod diff --git a/man/soyface_biomass.Rd b/man/soyface_biomass.Rd index adf073e..e8a0dd7 100644 --- a/man/soyface_biomass.Rd +++ b/man/soyface_biomass.Rd @@ -20,7 +20,12 @@ \item \code{DOY}: The day of year \item \code{Leaf_Mg_per_ha}: Leaf biomass per area expressed in Mg / ha \item \code{Stem_Mg_per_ha}: Stem biomass per area expressed in Mg / ha - \item \code{Pod_Mg_per_ha}: Pod biomass per area expressed in Mg / ha + \item \code{Rep_Mg_per_ha}: Pod biomass per area expressed in Mg / ha + \item \code{Seed_Mg_per_ha}: Seed biomass per area expressed in Mg / ha + \item \code{Litter_Mg_per_ha}: Mass of leaf litter accumulated between + harvests, expressed in Mg / ha + \item \code{CumLitter_Mg_per_ha}: Cumulative leaf litter biomass per aear + expressed in Mg / ha } The elements named \code{ambient_2002} and \code{ambient_2005} represent mean @@ -33,7 +38,7 @@ } \source{ - This data was obtained from several files available from the + The leaf, stem, and pod data was obtained from several files in from the \href{https://github.com/cropsinsilico/soybean-biocro}{Soybean-BioCro GitHub repository}: \itemize{ \item \code{Data/SoyFACE_data/2002_ambient_biomass.csv} @@ -43,6 +48,16 @@ } See that repository for more information. + + The leaf litter accumulated between harvests was obtained from the original + sources. The cumulative leaf litter was calculated from the amount accumulated + between harvests. The seed mass was estimated as a fraction of the total pod + mass, using proportionality factors determined from unpublished data collected + in Champaign, Illinois during 2021-2022. + + These data tables have not been published previously, but were used to + parameterize Soybean-BioCro as used in He \emph{et al.} 2024 + (\doi{10.1093/insilicoplants/diae009}). } \keyword{datasets} diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index 29202bb..d0fe2c3 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -2,22 +2,25 @@ model <- BioCro::soybean model$ode_solver <- BioCro::default_ode_solvers[['homemade_euler']] -convert_time <- function(x) { +process_table <- function(x) { within(x, { - time = (DOY - 1) * 24.0 - DOY = NULL + time = (DOY - 1) * 24.0 + DOY = NULL + Seed_Mg_per_ha = NULL + Litter_Mg_per_ha = NULL + CumLitter_Mg_per_ha = NULL }) } ddps <- list( ambient_2002 = list( - data = convert_time(soyface_biomass[['ambient_2002']]), - data_stdev = convert_time(soyface_biomass[['ambient_2002_std']]), + data = process_table(soyface_biomass[['ambient_2002']]), + data_stdev = process_table(soyface_biomass[['ambient_2002_std']]), drivers = BioCro::soybean_weather[['2002']], weight = 1 ), ambient_2005 = list( - data = convert_time(soyface_biomass[['ambient_2005']]), + data = process_table(soyface_biomass[['ambient_2005']]), drivers = BioCro::soybean_weather[['2005']], weight = 2 ) @@ -30,7 +33,7 @@ independent_args <- with(BioCro::soybean[['parameters']], { data_definitions <- list( Leaf_Mg_per_ha = 'Leaf', Stem_Mg_per_ha = 'Stem', - Pod_Mg_per_ha = 'Pod' + Rep_Mg_per_ha = 'Pod' ) dependent_arg_function <- function(x) { @@ -415,7 +418,7 @@ test_that('Bad data values and weights are detected', { objective_function( model, within(ddps, { - ambient_2005$data_stdev = convert_time(soyface_biomass[['ambient_2005_std']]) + ambient_2005$data_stdev = process_table(soyface_biomass[['ambient_2005_std']]) ambient_2005$data_stdev[['Leaf_Mg_per_ha']] <- -0.1 }), independent_args, From 1ae0e1ca11809b2be04ce031527fd76754ca5882 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Mon, 12 May 2025 22:42:15 -0500 Subject: [PATCH 39/54] Start adding vignette --- .Rbuildignore | 3 + DESCRIPTION | 6 + vignettes/.gitignore | 2 + vignettes/parameterizing_soybean_biocro.Rmd | 204 ++++++++++++++++++++ vignettes/references.bib | 46 +++++ 5 files changed, 261 insertions(+) create mode 100644 vignettes/.gitignore create mode 100644 vignettes/parameterizing_soybean_biocro.Rmd create mode 100644 vignettes/references.bib diff --git a/.Rbuildignore b/.Rbuildignore index 3c48ecc..69c5505 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -4,6 +4,9 @@ # reason, there should not be any blank comment lines, because they would # exclude any files with `#` in the filename, possibly causing confusing issues. +^Rhistory$ +# History files + ^.*\.Rproj$ # Designates the directory as an RStudio Project. diff --git a/DESCRIPTION b/DESCRIPTION index 3a85ad4..72f0f01 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -16,6 +16,12 @@ Depends: Imports: BioCro (>= 3.2.0) Suggests: + knitr, + rmarkdown, testthat (>= 3.0.0) +VignetteBuilder: knitr +License: MIT + file LICENSE +Encoding: UTF-8 +LazyData: true URL: https://github.com/BioCro/BioCroField, https://biocro.github.io/BioCroValidation/ Config/testthat/edition: 3 diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 0000000..097b241 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/parameterizing_soybean_biocro.Rmd b/vignettes/parameterizing_soybean_biocro.Rmd new file mode 100644 index 0000000..b746a29 --- /dev/null +++ b/vignettes/parameterizing_soybean_biocro.Rmd @@ -0,0 +1,204 @@ +--- +title: "Parameterizing Soybean-BioCro" +output: + rmarkdown::html_vignette: + toc: true + number_sections: true +vignette: > + %\VignetteIndexEntry{Parameterizing Soybean-BioCro} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +bibliography: references.bib +link-citations: yes +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.width = 7.5, + fig.height = 5, + fig.align = "center" +) +``` + +# Overview + +This article shows how to create an objective function that can be used to +parameterize BioCro's soybean model +[@matthews_soybean_biocro_2021; @lochocki_biocro_2022]. + +Since the original publication of Soybean-BioCro, the BioCro module library has +undergone several changes, and the model has been re-parameterized several +times. These parameterizations have not used `BioCroValidation`, since they were +performed before `BioCroValidation` was created. + +Here, we re-create the objective function that was used for the parameterization +included in version `3.2.0` of the BioCro R package. + +In the commands below, we will use functions from several libraries, so we will +load them now: + +```{r libraries} +# Load required libraries +library(BioCroValidation) +library(BioCro) +``` + +# Building the Objective Function + +In this section, we will use the `objective_function` function from +`BioCroValidation` package to create an objective function that can be used to +parameterize Soybean-BioCro. + +## The Observed Data + +The observed data needed to parameterize Soybean-BioCro is included in the +`BioCroValidation` package as the `soyface_biomass` data set, which consists of +two years (2002 and 2005) of biomass data and associated standard deviations, +included in four separate tables. However, each table requires some +pre-processing to get it ready. + +One issue is that the data set specifies the doy of year (DOY) for each harvest, +but we need to specify the time using BioCro's convention (the number of hours +since the start of the year). This can be addressed by defining a helper +function that adds a new `time` column following BioCro's convention: + +```{r convert_time} +# Define a helping function for adding a `time` column +convert_time <- function(data_table) { + within(data_table, { + # Define new `time` column + time = (DOY - 1) * 24.0 + }) +} +``` + +Another issue is that the data set includes pod and seed values, but +Soybean-BioCro calculates shell and seed masses, where the shell and seed +together comprise the pod. This can also be addressed with helper functions, +which will be different for biomass values and standard deviations: + +```{r get_shell} +# Define a helping function for calculating shell biomass +add_shell_biomass <- function(data_table) { + within(data_table, { + Shell_Mg_per_ha = Rep_Mg_per_ha - Seed_Mg_per_ha + }) +} + +# Define a helping function for calculating shell biomass standard deviation +add_shell_stdev <- function(data_table) { + within(data_table, { + Shell_Mg_per_ha = sqrt(Rep_Mg_per_ha^2 + Seed_Mg_per_ha^2) + }) +} +``` + +Finally, the data set includes some values that are not needed for the +parameterization. This includes the leaf litter accumulated between each +harvest, as well as the `DOY` and `Rep_Mg_per_ha` columns that have been +superseded by other columns defined above. These can be removed with a final +helper function: + +```{r remove_columns} +# Define a helping function for removing unneeded columns +remove_extra_columns <- function(data_table) { + within(data_table, { + DOY = NULL + Rep_Mg_per_ha = NULL + Litter_Mg_per_ha = NULL + }) +} +``` + +Now we can apply these to each table in the set: + +```{r process_tables} +# Process the data sets +ambient_2002_biomass <- remove_extra_columns( + add_shell_biomass( + convert_time( + soyface_biomass[['ambient_2002']] + ) + ) +) + + +ambient_2005_biomass <- remove_extra_columns( + add_shell_biomass( + convert_time( + soyface_biomass[['ambient_2005']] + ) + ) +) + +ambient_2002_stdev <- remove_extra_columns( + add_shell_stdev( + convert_time( + soyface_biomass[['ambient_2002_std']] + ) + ) +) + + +ambient_2005_stdev <- remove_extra_columns( + add_shell_stdev( + convert_time( + soyface_biomass[['ambient_2005_std']] + ) + ) +) +``` + +## The Data-Driver Pairs + +The `BioCro` R package includes weather data for the years in the +`soyface_biomass` data set. So now we are ready to define the data-driver pairs, +which includes the weather, the observed biomass, the standard deviation of the +observed biomass, and the weight to assign to each year: + +```{r data_driver_pairs} +# Define the data-driver pairs +data_driver_pairs <- list( + ambient_2002 = list( + data = ambient_2002_biomass, + data_stdev = ambient_2002_stdev, + drivers = BioCro::soybean_weather[['2002']], + weight = 1 + ), + ambient_2005 = list( + data = ambient_2005_biomass, + data_stdev = ambient_2005_stdev, + drivers = BioCro::soybean_weather[['2005']], + weight = 1 + ) +) +``` + +Here we have chosen equal weights for the two years. + +# Commands From This Document + +```{r, eval = FALSE} +<> + +### +### Define helping functions +### + +<<>> + +<> + +<> + +### +### Prepare inputs for `objective_function` +### + +<> + +``` + +# References diff --git a/vignettes/references.bib b/vignettes/references.bib new file mode 100644 index 0000000..fec18e3 --- /dev/null +++ b/vignettes/references.bib @@ -0,0 +1,46 @@ + +@article{lochocki_biocro_2022, + author = {Lochocki, Edward B and Rohde, Scott and Jaiswal, Deepak and Matthews, Megan L and Miguez, Fernando and Long, Stephen P and McGrath, Justin M}, + title = {BioCro II: a software package for modular crop growth simulations}, + journal = {in silico Plants}, + volume = {4}, + number = {1}, + pages = {diac003}, + year = {2022}, + month = {02}, + doi = {10.1093/insilicoplants/diac003}, +} + +@article{matthews_soybean_biocro_2021, + author = {Matthews, Megan L and Marshall-Colón, Amy and McGrath, Justin M and Lochocki, Edward B and Long, Stephen P}, + title = {Soybean-BioCro: a semi-mechanistic model of soybean growth}, + journal = {in silico Plants}, + volume = {4}, + number = {1}, + pages = {diab032}, + year = {2021}, + month = {12}, + doi = {10.1093/insilicoplants/diab032}, +} + +@article{he_seasonal_2023, + title = {Seasonal climate conditions impact the effectiveness of improving photosynthesis to increase soybean yield}, + journal = {Field Crops Research}, + volume = {296}, + pages = {108907}, + year = {2023}, + doi = {https://doi.org/10.1016/j.fcr.2023.108907}, + author = {Yufeng He and Megan L. Matthews}, +} + +@article{he_connecting_2024, + author = {He, Yufeng and Wang, Yu and Friedel, Douglas and Lang, Meagan and Matthews, Megan L}, + title = {Connecting detailed photosynthetic kinetics to crop growth and yield: a coupled modelling framework}, + journal = {in silico Plants}, + volume = {6}, + number = {2}, + pages = {diae009}, + year = {2024}, + month = {06}, + doi = {10.1093/insilicoplants/diae009}, +} From 4671308357fcf0c3b3398488dc98a3ecdcc0d04d Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Tue, 20 May 2025 17:44:02 -0500 Subject: [PATCH 40/54] Add more verbose printing --- R/objective_function.R | 3 ++- R/objective_function_input_checks.R | 19 ++++++++++++++++--- man/objective_function.Rd | 8 +------- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index d5dd158..1730e29 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -23,7 +23,8 @@ objective_function <- function( check_args_to_vary( independent_args, dependent_arg_function, - data_driver_pairs + data_driver_pairs, + verbose_startup ) # Get the model runners diff --git a/R/objective_function_input_checks.R b/R/objective_function_input_checks.R index 8df1f63..dd37fc9 100644 --- a/R/objective_function_input_checks.R +++ b/R/objective_function_input_checks.R @@ -148,7 +148,8 @@ check_data_driver_pairs <- function(base_model_definition, data_driver_pairs) { check_args_to_vary <- function( independent_args, dependent_arg_function, - data_driver_pairs + data_driver_pairs, + verbose ) { # Make sure the independent arguments have names @@ -168,6 +169,15 @@ check_args_to_vary <- function( } } + # Print argument names, if necessary + if (verbose) { + cat('\nThe independent arguments and their initial values:\n\n') + utils::str(independent_args) + + cat('\nThe dependent arguments and their initial values:\n\n') + utils::str(dependent_arg_function(independent_args)) + } + # Make sure no drivers were specified arg_names <- get_full_arg_names(independent_args, dependent_arg_function) @@ -402,12 +412,15 @@ check_obj_fun <- function(obj_fun, initial_ind_arg_values, verbose) { initial_error_terms <- obj_fun(as.numeric(initial_ind_arg_values), return_terms = TRUE) + initial_error <- sum(unlist(initial_error_terms)) + if (verbose) { cat('\nThe initial error metric terms:\n\n') utils::str(initial_error_terms) - } - initial_error <- sum(unlist(initial_error_terms)) + cat('\nThe initial error metric value:\n\n') + print(initial_error) + } if (!is.finite(initial_error)) { stop( diff --git a/man/objective_function.Rd b/man/objective_function.Rd index 93c6dfe..6eac3d5 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -607,13 +607,7 @@ if (require(BioCro)) { # This function could now be passed to an optimizer; here we will simply # evaluate it for two sets of parameter values. - # First try the initial values. - cat('\nError metric calculated from initial argument values:\n') - print( - obj_fun(as.numeric(independent_args)) - ) - - # Now try doubling each parameter value; in this case, the value of the + # Try doubling each parameter value; in this case, the value of the # objective function increases, indicating a lower degree of agreement between # the model and the observed data. cat('\nError metric calculated by doubling the initial argument values:\n') From 3dd88824f0f0476a233076ce7b90e9ea56b9a2f1 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Wed, 21 May 2025 16:10:52 -0500 Subject: [PATCH 41/54] Add more detailed terms to output --- R/objective_function_helpers.R | 8 +++++++- man/objective_function.Rd | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R index 13770ec..3805ce8 100644 --- a/R/objective_function_helpers.R +++ b/R/objective_function_helpers.R @@ -348,8 +348,14 @@ error_from_res <- function( # Return the sum of the penalty and error terms, or the individual errors if (return_terms) { + error_terms_by_quantity <- as.list(tapply( + errors, + long_form_data_table[['quantity_name']], + sum + )) + list( - least_squares_term = error_sum, + least_squares_terms = error_terms_by_quantity, extra_penalty = penalty ) } else { diff --git a/man/objective_function.Rd b/man/objective_function.Rd index 6eac3d5..b606011 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -610,13 +610,13 @@ if (require(BioCro)) { # Try doubling each parameter value; in this case, the value of the # objective function increases, indicating a lower degree of agreement between # the model and the observed data. - cat('\nError metric calculated by doubling the initial argument values:\n') + cat('\nError metric calculated by doubling the initial argument values:\n\n') print( obj_fun(2 * as.numeric(independent_args)) ) # We can also see the values of each term that makes up the error metric - cat('\nError metric terms calculated by doubling the initial argument values:\n') + cat('\nError metric terms calculated by doubling the initial argument values:\n\n') str( obj_fun(2 * as.numeric(independent_args), return_terms = TRUE) ) From feeb07c1f849831e819c76fa36952b6b1a983cf9 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Wed, 21 May 2025 16:11:22 -0500 Subject: [PATCH 42/54] Finish calculating objective function in vignette --- vignettes/parameterizing_soybean_biocro.Rmd | 402 ++++++++++++++++++-- vignettes/references.bib | 11 + 2 files changed, 372 insertions(+), 41 deletions(-) diff --git a/vignettes/parameterizing_soybean_biocro.Rmd b/vignettes/parameterizing_soybean_biocro.Rmd index b746a29..a0abc0d 100644 --- a/vignettes/parameterizing_soybean_biocro.Rmd +++ b/vignettes/parameterizing_soybean_biocro.Rmd @@ -30,11 +30,13 @@ parameterize BioCro's soybean model Since the original publication of Soybean-BioCro, the BioCro module library has undergone several changes, and the model has been re-parameterized several -times. These parameterizations have not used `BioCroValidation`, since they were +times. These parameterizations did not use `BioCroValidation`, since they were performed before `BioCroValidation` was created. -Here, we re-create the objective function that was used for the parameterization -included in version `3.2.0` of the BioCro R package. +However, `BioCroValidation` is able to re-create the objective functions that +were used for these parameterizations. Here, we re-create the objective function +that was used for the parameterization included in version `3.2.0` of the BioCro +R package. In the commands below, we will use functions from several libraries, so we will load them now: @@ -43,13 +45,32 @@ load them now: # Load required libraries library(BioCroValidation) library(BioCro) +library(lattice) ``` # Building the Objective Function In this section, we will use the `objective_function` function from `BioCroValidation` package to create an objective function that can be used to -parameterize Soybean-BioCro. +parameterize Soybean-BioCro. For more details about this, please see the help +page for `objective_function` by typing `?objective_function` from an R +terminal. + +## The Base Model Definition + +We first need a base model definition that includes the necessary modules, +initial values, parameters, and differential equation solver specifications. For +this example, we will simply use Soybean-BioCro as the base model, with just one +small change: we will use an Euler solver rather than the default solver, which +will help make the optimization run faster. For reasonable sets of parameter +values, the Euler solver does not seem to cause any substantial errors when +running Soybean-BioCro. + +```{r base_model_definition} +# Specify the base model definition +base_model_definition <- soybean +base_model_definition$ode_solver <- default_ode_solvers[['homemade_euler']] +``` ## The Observed Data @@ -83,6 +104,7 @@ which will be different for biomass values and standard deviations: # Define a helping function for calculating shell biomass add_shell_biomass <- function(data_table) { within(data_table, { + # The shell is all parts of the pod other than the seed Shell_Mg_per_ha = Rep_Mg_per_ha - Seed_Mg_per_ha }) } @@ -90,11 +112,63 @@ add_shell_biomass <- function(data_table) { # Define a helping function for calculating shell biomass standard deviation add_shell_stdev <- function(data_table) { within(data_table, { + # Add uncertainties in quadrature, a simple approach to error propagation Shell_Mg_per_ha = sqrt(Rep_Mg_per_ha^2 + Seed_Mg_per_ha^2) }) } ``` +Although the observations do not include root biomass, it is nevertheless +important to constrain the predicted root mass to reasonable values. To do this, +it is assumed that the maximum root mass is seventeen percent of the maximum +aboveground biomass, and that it is achieved at the same time as maximum +above-ground biomass, based on observations reported in @ordonez_root_2020. In +the observed data, the sum of stem and leaf mass is largest at the fifth time +point in both years. So, root mass is estimated at this single time point and +added to the observed values. + +In previous parameterizations, a standard deviation for the root mass was not +explicitly estimated; instead, the standard-deviation-based weight factor was +implicitly set to 1. Because the `'logarithm'` method was used, a weight factor +of 1 implies a standard deviation of \(1 / e - 10^{-5} \approx 0.3678694\). See +the documentation page (`?objective_function`) for more information about this +weighting method. + +The root mass and its standard deviation can be added with another set of helper +functions. Note that any observations set to `NA` will be ignored when +calculating the error metric. + +```{r get_root} +# Define a helping function for adding the root mass +add_root_biomass <- function(data_table) { + # Initialize all values to NA + data_table$Root_Mg_per_ha <- NA + + # Estimate a mass at one time point + row_to_use <- 5 + + col_to_add <- c( + 'Leaf_Mg_per_ha', + 'Stem_Mg_per_ha', + 'Rep_Mg_per_ha' + ) + + data_table[row_to_use, 'Root_Mg_per_ha'] <- + 0.17 * sum(data_table[row_to_use, col_to_add]) + + data_table +} + +# Define a helping function for adding the root standard deviation +add_root_stdev <- function(data_table) { + # We can set a value for each time point; any time points where the root mass + # is NA will be ignored + data_table$Root_Mg_per_ha <- 1 / exp(1) - 1e-5 + + data_table +} +``` + Finally, the data set includes some values that are not needed for the parameterization. This includes the leaf litter accumulated between each harvest, as well as the `DOY` and `Rep_Mg_per_ha` columns that have been @@ -105,6 +179,7 @@ helper function: # Define a helping function for removing unneeded columns remove_extra_columns <- function(data_table) { within(data_table, { + # Remove columns by setting them to NULL DOY = NULL Rep_Mg_per_ha = NULL Litter_Mg_per_ha = NULL @@ -115,40 +190,26 @@ remove_extra_columns <- function(data_table) { Now we can apply these to each table in the set: ```{r process_tables} -# Process the data sets -ambient_2002_biomass <- remove_extra_columns( - add_shell_biomass( - convert_time( - soyface_biomass[['ambient_2002']] - ) - ) -) - - -ambient_2005_biomass <- remove_extra_columns( - add_shell_biomass( - convert_time( - soyface_biomass[['ambient_2005']] - ) - ) -) - -ambient_2002_stdev <- remove_extra_columns( - add_shell_stdev( - convert_time( - soyface_biomass[['ambient_2002_std']] - ) - ) -) - - -ambient_2005_stdev <- remove_extra_columns( - add_shell_stdev( - convert_time( - soyface_biomass[['ambient_2005_std']] - ) - ) -) +# Process the data sets (biomass and stdev from 2002 and 2005) +ambient_2002_biomass <- + remove_extra_columns(add_root_biomass(add_shell_biomass(convert_time( + soyface_biomass[['ambient_2002']] + )))) + +ambient_2005_biomass <- + remove_extra_columns(add_root_biomass(add_shell_biomass(convert_time( + soyface_biomass[['ambient_2005']] + )))) + +ambient_2002_stdev <- + remove_extra_columns(add_root_stdev(add_shell_stdev(convert_time( + soyface_biomass[['ambient_2002_std']] + )))) + +ambient_2005_stdev <- + remove_extra_columns(add_root_stdev(add_shell_stdev(convert_time( + soyface_biomass[['ambient_2005_std']] + )))) ``` ## The Data-Driver Pairs @@ -178,27 +239,286 @@ data_driver_pairs <- list( Here we have chosen equal weights for the two years. +## The Post-Processing Function + +The observed data includes values of the total litter, which is comprised of +both leaf and stem litter. However, the model does not calculate this quntity; +instead, it returns separate values of leaf and stem litter. To address this +issue, we can provide a "post-processing function." This is an (optional) +function that is applied to each simulation result and can be used to add new +columns. Here we define such a function, which adds a new column for the total +litter: + +```{r post_process_function} +# Define the post-processing function +post_process_function <- function(sim_res) { + # Calculate the total litter as the sum of leaf and stem litter + within(sim_res, {TotalLitter = LeafLitter + StemLitter}) +} +``` + +## The Data Definitions + +The data sets above have columns whose names do not match the corresponding +model outputs. For example, the `Leaf_Mg_per_ha` column of the observed data +must be compared to the `Leaf` column of the model output, since both represent +the leaf mass per unit ground area. To handle this mismatch, we can provide a +set of "data definitions" that specify which columns should be compared: + +```{r data_definitions} +# Define the data definition list, where the element names are columns in the +# observed data tables, and the element values are the corresponding column +# names in the model outputs +data_definitions <- list( +# Observed Simulated + CumLitter_Mg_per_ha = 'TotalLitter', + Leaf_Mg_per_ha = 'Leaf', + Root_Mg_per_ha = 'Root', + Seed_Mg_per_ha = 'Grain', + Shell_Mg_per_ha = 'Shell', + Stem_Mg_per_ha = 'Stem' +) +``` + +## The Arguments to Vary + +Here we wish to vary several parameters related to carbon partitioning for +growth, senescence, maintenance respiration, and growth respiration: + +- For each growing tissue, there are two parameters (\(\alpha\) and \(\beta\)) + that influence the parbon partitioning coefficients. Here we will vary these + for the leaf, stem, and shell (6 parameters in total). + +- For each senescing tissue, there are three parameters (\(\alpha_{sen}\), + \(\beta_{sen}\), and `rate`) that influence when senescence begins and + the overall rate of scenescence. Here we will vary these for the leaf and stem + (6 parameters in total). + +- For each growing tissue, there is one parameter (`grc`) that influences the + rate of carbon use for growth respiration. Here we will vary these for the + stem and root (2 parameters in total). + +- For each tissue, there is one parameter (`mrc`) that influences the rate of + carbon use for maintenance respiration. Here we will vary these for the leaf, + stem, and root (3 parameters in total). + +Together, this is 17 arguments to vary. Typically, an optimization problem +requires more time for each free parameter involved, so it is helpful to vary +the smallest possible set. One way to reduce the number of free parameters is to +treat some as being "dependent." In other words, to calculate the values of some +parameters from the values of others, so that only some of them are truly free +or "independent." Here we will do this by fixing the value of `mrc_stem` to the +value of `mrc_leaf`. Thus, we can think of this is a single maintenance +respiration coefficient for the entire shoot; this reduces the number of +independent parameters by one (to 16). + +The independent arguments must be specified as a list of named numeric elements, +where the name is the argument name and the value is an initial guess for that +argument. Here we will use the default Soybean-BioCro values as our initial +guesses: + +```{r independent_args} +# Define a list of independent arguments and their initial values +independent_arg_names <- c( + # Partitioning for leaf, stem, and shell + 'alphaLeaf', + 'betaLeaf', + 'alphaStem', + 'betaStem', + 'alphaShell', + 'betaShell', + + # Senescence for leaf and stem + 'alphaSeneLeaf', + 'betaSeneLeaf', + 'rateSeneLeaf', + 'alphaSeneStem', + 'betaSeneStem', + 'rateSeneStem', + + # Growth respiration for stem and root + 'grc_stem', + 'grc_root', + + # Maintenance respiration for leaf and root + 'mrc_leaf', + 'mrc_root' +) + +independent_args <- soybean$parameters[independent_arg_names] +``` + +The dependent arguments must be specified as a function that takes a list of +independent arguments as its input, and returns a list of dependent arguments as +its output: + +```{r dependent_arg_function} +# Define a function that sets `mrc_stem` to the value of `mrc_leaf` +dependent_arg_function <- function(ind_args) { + list(mrc_stem = ind_args[['mrc_leaf']]) +} +``` + +## The Quantity Weights + +When determining the error metric value, we wish to assign different weights to +each type of observed value. This can be handled via the `quantity_weights`, +which must be a list of named numeric elements, where the name of each element +is an output from the simulation, and its value is the weight. + +```{r quantity_weights} +# Specify the quantity weights; there is no systematic way to determine these, +# but the following weights have worked well in the past for Soybean-BioCro +quantity_weights <- list( + Grain = 1.0, + Leaf = 1.0, + Root = 0.1, + Shell = 0.5, + Stem = 1.0, + TotalLitter = 0.1 +) +``` + +## The Extra Penalty Function + +Sometimes an optimizer may choose parameter values that produce close agreement +with the observed data but are nevertheless unreasonable from a biological +perspective. + +To prevent these unreasonable parameters from being chosen, "extra penalties" +can be added to the error metric. These penalties can be specified using an +`extra_penalty_function`, which must take the result from a BioCro simulation +as its input and return a numeric error penalty value, which generally should be +zero (when no issues are found) or a large positive number (if an issue has been +found). + +For Soybean-BioCro parameterization, three common issues are that: + +1. Carbon is never partitioned to one or more key tissues. + +2. Carbon partitioning to the stem and leaf starts at different times. + +3. Carbon partitioning to the leaves begins too early or too late. + +The function below will return a large value when any of these situations +occurs, and will otherwise return a value of zero. + +```{r extra_penalty_function} +# Define an extra penalty function +extra_penalty_function <- function(sim_res) { + # Set the penalty value + PENALTY <- 9999 + + # Get the first times when each partitioning coefficient becomes non-zero + k_thresh <- 0.01 # Threshold k value to decide when growth has started + hpd <- 24.0 # Hours per day + + time <- sim_res[['time']] + + time_grain <- time[sim_res[['kGrain']] > k_thresh][1] + time_leaf <- time[sim_res[['kLeaf']] > k_thresh][1] + time_shell <- time[sim_res[['kShell']] > k_thresh][1] + time_stem <- time[sim_res[['kStem']] > k_thresh][1] + + # Return a penalty if necessary + if (is.na(time_grain) | is.na(time_leaf) | is.na(time_shell) | is.na(time_stem)) { + # One or more tissues is not growing + return(PENALTY) + } else if (abs(time_leaf - time_stem) > 5 * hpd) { + # The starts of leaf and stem growth are more than 5 days apart + return(PENALTY) + } else if (time_leaf - time[1] > 20 * hpd | time_leaf - time[1] < 10 * hpd) { + # The start of leaf growth is too late (more than 20 days after sowing) or + # too early (fewer than 10 days after sowing) + return(PENALTY) + } else { + # No problems were detected + return(0.0) + } +} +``` + +## The Objective Function + +Now we are just about ready to build the objective function. There are a few +more details to discuss: + +- Soybean-BioCro has always used the `'mean_max'` method for determining + normalization factors; see Equations 14-16 of @matthews_soybean_biocro_2021 + for more details. + +- Soybean-BioCro has always used the `'logarithm'` method for determining + weights from standard deviations; see Equation 17 of + @matthews_soybean_biocro_2021 for more details. + +- Soybean-BioCro has not used any regularization. + +With this, it is possible to build the function. Note that some useful +information is printed out when the function is created, such as the full list +of observed values and their corresponding weights. + +```{r objective_function} +# Create the objective function +obj_fun <- objective_function( + base_model_definition, + data_driver_pairs, + independent_args, + quantity_weights, + data_definitions = data_definitions, + normalization_method = 'mean_max', + stdev_weight_method = 'logarithm', + regularization_method = 'none', + dependent_arg_function = dependent_arg_function, + post_process_function = post_process_function, + extra_penalty_function = extra_penalty_function +) +``` + # Commands From This Document ```{r, eval = FALSE} +### +### Preliminaries +### + <> ### ### Define helping functions ### -<<>> +<> <> -<> +<> + +<> ### -### Prepare inputs for `objective_function` +### Prepare inputs for `objective_function` and call it ### +<> + +<> + <> +<> + +<> + +<> + +<> + +<> + +<> + +<> + ``` # References diff --git a/vignettes/references.bib b/vignettes/references.bib index fec18e3..4534ebb 100644 --- a/vignettes/references.bib +++ b/vignettes/references.bib @@ -44,3 +44,14 @@ @article{he_connecting_2024 month = {06}, doi = {10.1093/insilicoplants/diae009}, } + +@article{ordonez_root_2020, + title = {Root to shoot and carbon to nitrogen ratios of maize and soybean crops in the {US} {Midwest}}, + volume = {120}, + doi = {10.1016/j.eja.2020.126130}, + journal = {European Journal of Agronomy}, + author = {Ordóñez, Raziel A. and Archontoulis, Sotirios V. and Martinez-Feria, Rafael and Hatfield, Jerry L. and Wright, Emily E. and Castellano, Michael J.}, + month = oct, + year = {2020}, + pages = {126130}, +} From efce402a206a3fb4c853f8a2e48cd517716a38a0 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Thu, 22 May 2025 10:46:25 -0500 Subject: [PATCH 43/54] Add more error checking Make sure we can properly handle when a predicted value is NA, and that an error is thrown when an input argument value is NA. --- R/objective_function_helpers.R | 21 +++++++--- tests/testthat/test-objective_function.R | 52 ++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 5 deletions(-) diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R index 3805ce8..c2d1829 100644 --- a/R/objective_function_helpers.R +++ b/R/objective_function_helpers.R @@ -62,6 +62,13 @@ get_model_runner <- function( c(x, as.numeric(dependent_arg_function(x_for_dep_arg_func))) } + if (any(!is.finite(x_for_partial))) { + stop( + 'At least one independent or dependent argument ', + 'value is not finite' + ) + } + initial_res <- partial_func(x_for_partial) if (is.null(post_process_function)) { @@ -272,13 +279,17 @@ one_error <- function( normalization ) { - qw <- if (predicted < observed) { - quantity_weight[1] # Underprediction + if (!is.finite(predicted)) { + NA } else { - quantity_weight[2] # Overprediction - } + qw <- if (predicted < observed) { + quantity_weight[1] # Underprediction + } else { + quantity_weight[2] # Overprediction + } - (observed - predicted)^2 * qw * ddp_weight * var_weight / normalization + (observed - predicted)^2 * qw * ddp_weight * var_weight / normalization + } } # Helping function for returning a failure value diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index d0fe2c3..91bdf60 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -382,6 +382,7 @@ test_that('Bad variance methods are detected', { }) test_that('Bad return values are detected', { + # A penalty evaluates to NA expect_error( objective_function( model, @@ -395,6 +396,20 @@ test_that('Bad return values are detected', { ), 'The objective function did not return a finite value when using the initial argument values; instead, it returned: NA' ) + + # A predicted value is NA + expect_error( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = function(x) {within(x, {Pod = NA})}, + verbose_startup = verbose_startup + ), + 'The objective function did not return a finite value when using the initial argument values; instead, it returned: NA' + ) }) test_that('Bad regularization methods are detected', { @@ -437,3 +452,40 @@ test_that('Bad data values and weights are detected', { fixed = TRUE ) }) + +test_that('NA argument values and predicted values are handled', { + # An independent argument value is NA + expect_error( + objective_function( + model, + ddps, + within(independent_args, {alphaLeaf = NA}), + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ), + 'The model could not be run with the following drivers: +ambient_2002: Error in runner(as.numeric(independent_args)): At least one independent or dependent argument value is not finite +ambient_2005: Error in runner(as.numeric(independent_args)): At least one independent or dependent argument value is not finite', + fixed = TRUE + ) + + # A dependent argument value is NA + expect_error( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + data_definitions = data_definitions, + dependent_arg_function = function(x) {list(alphaStem = NA)}, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ), + 'The model could not be run with the following drivers: +ambient_2002: Error in runner(as.numeric(independent_args)): At least one independent or dependent argument value is not finite +ambient_2005: Error in runner(as.numeric(independent_args)): At least one independent or dependent argument value is not finite', + fixed = TRUE + ) +}) From d50a43125f86050b66488e9a0ae3276e08f1e991 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Thu, 22 May 2025 11:34:54 -0500 Subject: [PATCH 44/54] Add debug mode option --- R/objective_function_helpers.R | 38 +++++++++++++++++++++++++++++++--- man/objective_function.Rd | 32 +++++++++++++++++----------- 2 files changed, 55 insertions(+), 15 deletions(-) diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R index c2d1829..ce32bef 100644 --- a/R/objective_function_helpers.R +++ b/R/objective_function_helpers.R @@ -403,7 +403,18 @@ get_obj_fun <- function( regularization_method ) { - function(x, lambda = 0, return_terms = FALSE) { + function(x, lambda = 0, return_terms = FALSE, debug_mode = FALSE) { + if (debug_mode) { + msg <- paste0( + '\nTime: ', + Sys.time(), + ' Independent argument values: ', + paste(x, collapse = ', '), + '\n' + ) + cat(msg) + } + errors <- lapply(seq_along(model_runners), function(i) { runner <- model_runners[[i]] res <- runner(x) @@ -422,15 +433,36 @@ get_obj_fun <- function( reg_penalty <- regularization_penalty(x, regularization_method, lambda) if (return_terms) { - list( + error_metric_terms <- list( terms_from_data_driver_pairs = stats::setNames( errors, names(model_runners) ), regularization_penalty = reg_penalty ) + + if (debug_mode) { + cat(paste0('Time: ', Sys.time()), ' Error metric terms: ') + utils::str(error_metric_terms) + cat('\n') + } + + error_metric_terms } else { - sum(as.numeric(errors)) + reg_penalty + error_metric <- sum(as.numeric(errors)) + reg_penalty + + if (debug_mode) { + msg <- paste0( + 'Time: ', + Sys.time(), + ' Error metric: ', + error_metric, + '\n' + ) + cat(msg) + } + + error_metric } } } diff --git a/man/objective_function.Rd b/man/objective_function.Rd index b606011..4693d9c 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -468,7 +468,7 @@ \value{ A function \code{obj_fun} with signature - \code{obj_fun(x, lambda = 0, return_terms = FALSE)}. + \code{obj_fun(x, lambda = 0, return_terms = FALSE, debug_mode = FALSE)}. Here, \code{x} is a numeric vector of values of the independent arguments (in the same order as in \code{independent_arg_names}), and \code{lambda} is the @@ -484,6 +484,15 @@ Setting it to \code{TRUE} can be useful for troubleshooting, or for diagnostics such as checking the quality of fit for each of the data-driver pairs. + + The \code{debug_mode} argument determines whether \code{obj_fun} is running in + debug mode. In debug mode, each time \code{obj_fun} is called, it will print + the values of \code{x} and the error metric to the R terminal. This can be + useful when troubleshooting a problem with an optimization, since it provides + a record of any problematic parameter combinations. When setting + \code{debug_mode} to \code{TRUE}, also consider using \code{\link[base]{sink}} + to write the outputs to a file instead of the R terminal. In that case, there + will still be a record even if R crashes. } \examples{ @@ -609,16 +618,15 @@ if (require(BioCro)) { # Try doubling each parameter value; in this case, the value of the # objective function increases, indicating a lower degree of agreement between - # the model and the observed data. - cat('\nError metric calculated by doubling the initial argument values:\n\n') - print( - obj_fun(2 * as.numeric(independent_args)) - ) - - # We can also see the values of each term that makes up the error metric - cat('\nError metric terms calculated by doubling the initial argument values:\n\n') - str( - obj_fun(2 * as.numeric(independent_args), return_terms = TRUE) - ) + # the model and the observed data. Here we will call `obj_fun` in debug mode, + # which will automatically print the value of the error metric. + cat('\nError metric calculated by doubling the initial argument values:\n') + error_metric <- obj_fun(2 * as.numeric(independent_args), debug_mode = TRUE) + + # We can also see the values of each term that makes up the error metric; + # again, we will call `obj_fun` in debug mode for automatic printing. + cat('\nError metric terms calculated by doubling the initial argument values:\n') + error_terms <- + obj_fun(2 * as.numeric(independent_args), return_terms = TRUE, debug_mode = TRUE) } } From 8e6374a031c7f0abdabab38d6efebefeef3d37f7 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Thu, 22 May 2025 11:49:42 -0500 Subject: [PATCH 45/54] Add optimization example to vignette --- DESCRIPTION | 1 + vignettes/parameterizing_soybean_biocro.Rmd | 231 +++++++++++++++++++- 2 files changed, 230 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 72f0f01..607df58 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -16,6 +16,7 @@ Depends: Imports: BioCro (>= 3.2.0) Suggests: + dfoptim, knitr, rmarkdown, testthat (>= 3.0.0) diff --git a/vignettes/parameterizing_soybean_biocro.Rmd b/vignettes/parameterizing_soybean_biocro.Rmd index a0abc0d..9f5d540 100644 --- a/vignettes/parameterizing_soybean_biocro.Rmd +++ b/vignettes/parameterizing_soybean_biocro.Rmd @@ -45,7 +45,7 @@ load them now: # Load required libraries library(BioCroValidation) library(BioCro) -library(lattice) +library(dfoptim) ``` # Building the Objective Function @@ -474,6 +474,217 @@ obj_fun <- objective_function( ) ``` +# Optimizing the Parameter Values + +The objective function is designed to be passed to a minimization algorithm, +which will determine the argument values that produce the best agreement between +the model predictions and the observations. + +Soybean-BioCro has already been parameterized, so there is already good +agreement between the model and the data. This can be seen by examining the +value of the error metric when using the default Soybean-BioCro values: + +```{r} +# Evaluate the error function with default Soybean-BioCro parameters +default_error <- obj_fun(as.numeric(independent_args)) +``` + +This evaluates to `r default_error`. This is a low value for a Soybean-BioCro +parameterization, indicating that good agreement has already been found. + +Here, as an example, we will intentionally change each parameter value by a +small random amount, and then use an optimizer to improve the parameter values; +in an ideal world, the optimizer will eventually pick parameter values close to +the original Soybean-BioCro values. + +There are many optimizers available in R. Base R includes the `optim` function, +and others are available from the `dfoptim` and `DEoptim` packages. Here we will +use the `nmkb` optimizer from the `dfoptim` library, which requires upper and +lower bounds for each parameter and an initial guess. + +## Choosing an Initial Guess + +As mentioned above, we will intentionally choose a "bad" initial guess by +tweaking each parameter value by a small random amount. Note that we set a seed +to ensure the same result is obtained every time this is performed. Also note +that the initial guess must be a numeric vector, where the elements are ordered +as they are in `independent_args`. + +```{r initial_guess} +# Set a seed +set.seed(1234) + +# Make an initial guess by perturbing the default values by a small amount +rel_size <- 0.02 + +initial_guess <- as.numeric(independent_args) * + (1.0 + runif(length(independent_args), -rel_size, rel_size)) +``` + +Even though the changes to parameter values are small, there is still a +substantial change in the value of the error metric: + +```{r} +# Evaluate the error function with default Soybean-BioCro parameters +initial_error <- obj_fun(initial_guess) +``` + +This evaluates to `r initial_error`, which is about +`r round(100 * (initial_error - default_error) / initial_error)` percent larger +than with the default parameter values. + +## Choosing Lower and Upper Bounds + +There is not always a systematic approach to choosing lower and upper bounds for +parameter values, but the following bounds have worked well for Soybean-BioCro +in the past: + +- The \(\alpha\) parameters used in partitioning and senescence calculations are + confined to the interval [0, 50]. + +- The \(\beta\) parameters used in partitioning and senescence calculations are + confined to the interval [-50, 0]. + +- The senescence rates have a lower bound of zero, but have different upper + bounds for each tissue. + +- The maintenance respiration coefficients are confined to the interval + [1e-6, 1e-2]. + +- The growth respiration coefficients must be positive and non-zero, but have + different bounds for each tissue. + +There are many possible ways to specify the bounds in R, but ultimately they +must be expressed as numeric vectors, where the elements are ordered as they are +in `independent_args`. Here we supply lower and upper bounds for each parameter +in a data frame and then ensure they are properly ordered. Later, the data frame +columns can be passed to the optimizer as the bounds: + +```{r setting_bounds} +# Define a table with the bounds +bounds <- read.table( + textConnection(' + arg_name lower upper + alphaLeaf +0 +50 + alphaStem +0 +50 + alphaShell +0 +50 + alphaSeneLeaf +0 +50 + alphaSeneStem +0 +50 + betaLeaf -50 +0 + betaStem -50 +0 + betaShell -50 +0 + betaSeneLeaf -50 +0 + betaSeneStem -50 +0 + rateSeneLeaf +0 +0.0125 + rateSeneStem +0 +0.005 + mrc_leaf +1e-6 +1e-2 + mrc_root +1e-6 +1e-2 + grc_stem +8e-4 +0.08 + grc_root +0.0025 +0.075 + '), + header = TRUE, + stringsAsFactors = FALSE +) + +# Make sure it is ordered properly +bounds <- bounds[match(independent_arg_names, bounds[['arg_name']]), ] +``` + +## Running the Optimizer + +Now we will use an optimizer to improve on the initial guess. As mentioned +above, we will use the `nmkb` optimizer from the `dfoptim` package. This is a +good choice when a decent starting guess is known. If a broader search is +necessary, `DEoptim` from the `DEoptim` package may be more appropriate, +although it generally needs more time to run. + +To make sure this example does not take too much time, we will use a loose +tolerance; a more realistic example would probably use `1e-4` or `1e-5`. + +```{r run_optimizer, eval = FALSE} +# Run the optimizer +optim_res <- nmkb( + initial_guess, + obj_fun, + bounds[['lower']], + bounds[['upper']], + control = list( + tol = 1e-2, + restarts.max = 10 + ), + debug_mode = FALSE # Passed to obj_fun; set to TRUE to enable debug mode +) +``` + +```{r, echo = FALSE} +timing <- system.time({ +<> +}) +``` + +When this document was generated, running the optimizer required the following +amount of time: + +```{r, echo = FALSE} +print(timing) +``` + +The total time was about `r round(timing[3] / 60, 2)` minutes. For a real +parameterization problem, it can be many times longer, and may even need days to +run on a personal laptop computer. + +The optimizer also reports how many times the objective function was called, +among other details: + +```{r} +str(optim_res) +``` + +The value of `feval` is `r optim_res[['feval']]`, so on average, each call of +the objective function required approximately +`r round(timing[3] / optim_res[['feval']], 3)` seconds. + +## Comparing Parameter Values + +Let's see whether the optimized parameters are closer to the default parameters +than the initial guess was. + +```{r compare_param} +# Create a table of the various independent argument values +ind_arg_table <- data.frame( + arg_name = independent_arg_names, + defaults = as.numeric(independent_args), + initial_guess = initial_guess, + optimized = optim_res[['par']], + stringsAsFactors = FALSE +) + +# Add differences +ind_arg_table <- within(ind_arg_table, { + initial_diff = abs(initial_guess - defaults) + optimized_diff = abs(optimized - defaults) + improved = optimized_diff < initial_diff +}) + +# View results +print(ind_arg_table) +``` + +In this table, when the `improved` column is `TRUE`, this means that the +optimized parameter value is closer to the default value than the initial guess +was; in other words, it means that the optimizer truly improved on the initial +guess. In this example, `r sum(ind_arg_table[['improved']])` out of +`r nrow(ind_arg_table)` parameter estimates were improved +(`r round(100 * sum(ind_arg_table[['improved']]) / nrow(ind_arg_table))` percent). + +We can also compare the error metric to its original value. As shown above, it +is now `r optim_res[['value']]`, which is only +`r round(100 * (optim_res[['value']] - default_error) / optim_res[['value']], 1)` +percent larger than with the default parameter values. + +The optimized parameter values could likely be improved by using a more +stringent tolerance for the optimizer, but this would require more time to run. + # Commands From This Document ```{r, eval = FALSE} @@ -496,7 +707,7 @@ obj_fun <- objective_function( <> ### -### Prepare inputs for `objective_function` and call it +### Prepare inputs for `objective_function` and create an objective function ### <> @@ -519,6 +730,22 @@ obj_fun <- objective_function( <> +### +### Use an optimizer to choose parameter values +### + +<> + +<> + +<> + +### +### Check and record the new values +### + +<> + ``` # References From a64c7cd64da317163bd398b5e4dbe2d1079de374 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Thu, 22 May 2025 16:15:43 -0500 Subject: [PATCH 46/54] Add `update_model` and finish vignette example --- DESCRIPTION | 1 + NAMESPACE | 1 + NEWS.md | 8 +- R/update_model.R | 80 ++++++++++++ man/update_model.Rd | 70 +++++++++++ tests/testthat/test-update_model.R | 83 +++++++++++++ vignettes/parameterizing_soybean_biocro.Rmd | 128 ++++++++++++++++++++ 7 files changed, 370 insertions(+), 1 deletion(-) create mode 100644 R/update_model.R create mode 100644 man/update_model.Rd create mode 100644 tests/testthat/test-update_model.R diff --git a/DESCRIPTION b/DESCRIPTION index 607df58..4dfa230 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,6 +17,7 @@ Imports: BioCro (>= 3.2.0) Suggests: dfoptim, + lattice, knitr, rmarkdown, testthat (>= 3.0.0) diff --git a/NAMESPACE b/NAMESPACE index ba5cd56..ad14bd8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,2 +1,3 @@ export(objective_function) +export(update_model) export(write_model) diff --git a/NEWS.md b/NEWS.md index 8141b4c..b13d94e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -37,7 +37,13 @@ be directly added to this file to describe the related changes. - Added 2002 and 2005 SoyFACE biomass data. -- Added the `objective_function` function. +- Added several new functions: + + - `objective_function` + + - `update_model` + +- Added a vignette illustrating how to perform a model parameterization. # Changes in BioCroValidation Version 0.1.0 diff --git a/R/update_model.R b/R/update_model.R new file mode 100644 index 0000000..1a5d32a --- /dev/null +++ b/R/update_model.R @@ -0,0 +1,80 @@ +update_model <- function( + base_model_definition, + independent_args, + new_ind_arg_values, + dependent_arg_function = NULL +) +{ + # Make sure the model definition has initial_values and parameters + req_elements <- c('initial_values', 'parameters') + if (!all(req_elements %in% names(base_model_definition))) { + stop( + 'The `base_model_definition` must have the following elements: ', + paste(req_elements, collapse = ', ') + ) + } + + # Make sure the argument lists have the same length + if (length(independent_args) != length(new_ind_arg_values)) { + stop('`independent_args` and `new_ind_arg_values` must have the same length') + } + + # Update the values of the independent arguments + new_independent_args <- stats::setNames( + as.list(new_ind_arg_values), + names(independent_args) + ) + + # Also get the values of the dependent arguments + all_args <- if (!is.null(dependent_arg_function)) { + c(new_independent_args, dependent_arg_function(new_independent_args)) + } else { + new_independent_args + } + + # Find all quantities in the initial values and parameters and store them in + # a list + iv <- as.list( + rep_len( + 'initial_values', + length(base_model_definition[['initial_values']]) + ) + ) + names(iv) <- names(base_model_definition[['initial_values']]) + + param <- as.list( + rep_len( + 'parameters', + length(base_model_definition[['parameters']]) + ) + ) + names(param) <- names(base_model_definition[['parameters']]) + + model_quantities <- c(iv, param) + + # Make sure each supplied argument is included in the model + not_in_model <- !names(all_args) %in% names(model_quantities) + + if (any(not_in_model)) { + msg <- paste0( + 'Values were supplied for the following quantities, but they ', + 'are not `initial_values` or `parameters` of ', + 'the `base_model_definition`: ', + paste(names(all_args)[not_in_model], collapse = ', ') + ) + stop(msg) + } + + # Make a copy of the model with the new argument values and return it + new_model_definition <- base_model_definition + + for (i in seq_along(all_args)) { + arg_name <- names(all_args)[i] + arg_type <- model_quantities[[arg_name]] + arg_value <- all_args[[i]] + + new_model_definition[[arg_type]][[arg_name]] <- arg_value + } + + new_model_definition +} diff --git a/man/update_model.Rd b/man/update_model.Rd new file mode 100644 index 0000000..f3a86ae --- /dev/null +++ b/man/update_model.Rd @@ -0,0 +1,70 @@ +\name{update_model} + +\alias{update_model} + +\title{Update a BioCro model definition} + +\description{ + Following an optimization, it is typically necessary to update the initial + values and/or parameters of a base model definition with new values determined + during the optimization. The \code{update_model} function helps to streamline + this process. It is expected that this function will be called after + \code{\link{objective_function}}. +} + +\usage{ + update_model( + base_model_definition, + independent_args, + new_ind_arg_values, + dependent_arg_function = NULL + ) +} + +\arguments{ + \item{base_model_definition}{ + The same value that was passed to \code{\link{objective_function}}. + } + + \item{independent_args}{ + The same value that was passed to \code{\link{objective_function}}. + } + + \item{new_ind_arg_values}{ + A numeric vector representing new values of the independent arguments, + typically determined by an optimizer. + } + + \item{dependent_arg_function}{ + The same value that was passed to \code{\link{objective_function}}. + } +} + +\value{ + A list based on \code{base_model_definition} but with new values of some of + its \code{initial_values} and \code{parameters}, as specified by the elements + of \code{independent_args} and \code{new_ind_arg_values}. +} + +\examples{ +if (require(BioCro)) { + # Update the default Soybean-BioCro model with new values of `Leaf` (an + # initial value) and `alphaStem` (a parameter) + base_model <- BioCro::soybean + + new_model <- update_model( + base_model, + list(Leaf = 1, alphaLeaf = 2), # The values here will not be used + c(1000, 2000) # These are the actual new values + ) + + # Compare the two models + cat('\n\nComparing initial Leaf values:\n') + print(base_model$initial_values$Leaf) + print(new_model$initial_values$Leaf) + + cat('\n\nComparing alphaLeaf values:\n') + print(base_model$parameters$alphaLeaf) + print(new_model$parameters$alphaLeaf) +} +} diff --git a/tests/testthat/test-update_model.R b/tests/testthat/test-update_model.R new file mode 100644 index 0000000..93d74eb --- /dev/null +++ b/tests/testthat/test-update_model.R @@ -0,0 +1,83 @@ +NEWVAL <- 10000 + +base_model_definition <- BioCro::soybean +independent_args <- list(alphaLeaf = NEWVAL, Leaf = NEWVAL) +new_ind_arg_values <- c(NEWVAL, NEWVAL) +dependent_arg_function <- function(x) {list(alphaStem = x$alphaLeaf)} + +test_that('A base model definition can be updated', { + # Without dependent arguments + expect_silent( + update_model( + base_model_definition, + independent_args, + new_ind_arg_values + ) + ) + + # With dependent arguments + new_model <- expect_silent( + update_model( + base_model_definition, + independent_args, + new_ind_arg_values, + dependent_arg_function = dependent_arg_function + ) + ) + + # Initial values are updated + expect_equal( + new_model[['initial_values']][['Leaf']], + NEWVAL + ) + + # Other initial values remain the same + expect_equal( + new_model[['initial_values']][['Stem']], + base_model_definition[['initial_values']][['Stem']] + ) + + # Dependent parameters are updated + expect_equal( + new_model[['parameters']][['alphaStem']], + NEWVAL + ) + + # Other parameters remain the same + expect_equal( + new_model[['parameters']][['alphaRoot']], + base_model_definition[['parameters']][['alphaRoot']] + ) +}) + +test_that('Base model definition must be valid', { + expect_error( + update_model( + base_model_definition[c('direct_modules', 'differential_modules')], + independent_args, + new_ind_arg_values, + dependent_arg_function = dependent_arg_function + ), + 'The `base_model_definition` must have the following elements: initial_values, parameters' + ) +}) + +test_that('Supplied arguments must be part of the base model definition', { + expect_error( + update_model( + base_model_definition, + c(independent_args, list(bad_arg = 10)), + new_ind_arg_values + ), + '`independent_args` and `new_ind_arg_values` must have the same length' + ) + + expect_error( + update_model( + base_model_definition, + c(independent_args, list(bad_arg = 10)), + c(new_ind_arg_values, 25) + ), + 'Values were supplied for the following quantities, but they are not `initial_values` or `parameters` of the `base_model_definition`: bad_arg' + ) +}) diff --git a/vignettes/parameterizing_soybean_biocro.Rmd b/vignettes/parameterizing_soybean_biocro.Rmd index 9f5d540..5be249e 100644 --- a/vignettes/parameterizing_soybean_biocro.Rmd +++ b/vignettes/parameterizing_soybean_biocro.Rmd @@ -46,6 +46,7 @@ load them now: library(BioCroValidation) library(BioCro) library(dfoptim) +library(lattice) ``` # Building the Objective Function @@ -685,6 +686,125 @@ percent larger than with the default parameter values. The optimized parameter values could likely be improved by using a more stringent tolerance for the optimizer, but this would require more time to run. +# Comparing Model Outputs + +Another way to evaluate the results of the optimization is to compare +simulations using the default, perturbed, and re-optimized versions of the +model. + +Following the re-parameterization, we now have new values of the independent +arguments, but this is not enough to actually run the new version of the model. +Thus, the next step is to form a new model definition by updating the values of +the default soybean model. This can be accomplished using the `update_model` +function from `BioCroValidation`: + +```{r form_new_models} +# Get model definition lists for the perturbed and re-parameterized versions of +# the soybean model +soybean_perturbed <- update_model( + BioCro::soybean, + independent_args, + initial_guess, + dependent_arg_function = dependent_arg_function +) + +soybean_reparam <- update_model( + BioCro::soybean, + independent_args, + optim_res[['par']], + dependent_arg_function = dependent_arg_function +) +``` + +We can check that the three models have different values of key parameters, such +as the "dependent" argument `mrc_stem`: + +```{r} +print(BioCro::soybean$parameters$mrc_stem) + +print(soybean_perturbed$parameters$mrc_stem) + +print(soybean_reparam$parameters$mrc_stem) +``` + +Now we can run each version of the model for a single year and visually compare +their outputs: + +```{r comparison_plots} +# Define a helper function that runs a single model for the year 2005 +run_2005 <- function(model_definition) { + with(model_definition, {run_biocro( + initial_values, + parameters, + soybean_weather[['2005']], + direct_modules, + differential_modules, + ode_solver + )}) +} + +# Run each model and combine the results +full_res <- rbind( + within(run_2005(BioCro::soybean), {model = 'Default Soybean-BioCro'}), + within(run_2005(soybean_perturbed), {model = 'Perturbed Soybean-BioCro'}), + within(run_2005(soybean_reparam), {model = 'Re-parameterized Soybean-BioCro'}) +) + +# Plot the results +xyplot( + Leaf + Stem + Root + Grain ~ fractional_doy, + group = model, + data = full_res, + type = 'l', + auto.key = list(space = 'top'), + xlab = 'Day of year (2005)', + ylab = 'Biomass (Mg / ha)' +) +``` + +Here we can see that while the simulated stem, root, and grain (seed) biomass +values do not differ much between the models, there are large differences in the +leaf mass, where the default and re-optimized versions are similar and the +perturbed version predicts much smaller values. + +## Saving the New Model Definition + +A realistic parameterization takes a long time to complete, so it is important +to save the results for later use. One approach is to save the model definition +list using the `save` or `saveRDS` functions from base R. However, these options +create binary files that are not human-readable, and they cannot be easily +tracked using `git`. As an alternative, the `BioCroValidation` includes a +function called `write_model` that forms a string representing an R command that +can be called to re-create a model definition. This command string can be +written to a text file, making it easy to read and to track with `git`. + +Here we apply `write_model` to the re-optimized soybean model: + +```{r write_model_command} +# Convert the re-parameterized soybean model to an R command string +r_cmd_string <- with(soybean_reparam, write_model( + 'soybean_reparam', + direct_modules, + differential_modules, + initial_values, + parameters, + ode_solver +)) +``` + +We can view the resulting R command string: + +```{r} +writeLines(r_cmd_string) +``` + +It can also be written to a text file: + +```{r write_model_to_file, eval = FALSE} +# Save the model definition as an R file in the current working directory +writeLines(r_cmd_string, './soybean_reparam.R') +``` + # Commands From This Document ```{r, eval = FALSE} @@ -746,6 +866,14 @@ stringent tolerance for the optimizer, but this would require more time to run. <> +<> + +<> + +<> + +<> + ``` # References From 388dc5bb144f9cac54e47f9e7c13c2c55a221501 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Thu, 22 May 2025 18:16:12 -0500 Subject: [PATCH 47/54] Add `bounds_table` and use it in the vignette --- NAMESPACE | 1 + NEWS.md | 7 +- R/bounds_table.R | 154 ++++++++++++++++++++ man/bounds_table.Rd | 118 +++++++++++++++ tests/testthat/test-bounds_table.R | 98 +++++++++++++ vignettes/parameterizing_soybean_biocro.Rmd | 63 ++++---- 6 files changed, 407 insertions(+), 34 deletions(-) create mode 100644 R/bounds_table.R create mode 100644 man/bounds_table.Rd create mode 100644 tests/testthat/test-bounds_table.R diff --git a/NAMESPACE b/NAMESPACE index ad14bd8..6c90842 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,3 +1,4 @@ +export(bounds_table) export(objective_function) export(update_model) export(write_model) diff --git a/NEWS.md b/NEWS.md index b13d94e..ab5e3fa 100644 --- a/NEWS.md +++ b/NEWS.md @@ -37,11 +37,8 @@ be directly added to this file to describe the related changes. - Added 2002 and 2005 SoyFACE biomass data. -- Added several new functions: - - - `objective_function` - - - `update_model` +- Added several new functions: `objective_function`, `update_model`, and + `bounds_table` - Added a vignette illustrating how to perform a model parameterization. diff --git a/R/bounds_table.R b/R/bounds_table.R new file mode 100644 index 0000000..2a7e407 --- /dev/null +++ b/R/bounds_table.R @@ -0,0 +1,154 @@ +# A helper function for checking the bounds list for mistakes; if an issue is +# found, this function will throw an error; otherwise it will be silent with no +# return value. +check_bounds_list <- function(bounds_list, independent_args) { + # Must be a list of named elements + if (!is.list(bounds_list) | is.null(names(bounds_list))) { + stop('`bounds_list` must be a list of named elements') + } + + # Must contain all elements in independent_args + missing_element <- sapply(names(independent_args), function(x) { + !x %in% names(bounds_list) + }) + + if (any(missing_element)) { + msg <- paste0( + 'The following elements were included in ', + '`independent_args` but not `bounds_list`: ', + paste(names(independent_args)[missing_element], collapse = ', ') + ) + stop(msg) + } + + # Each element must have length 2 + length_two <- sapply(bounds_list, function(x) { + xlen <- length(x) + + if (is.finite(xlen)) { + length(x) == 2 + } else { + FALSE + } + }) + + if (any(!length_two)) { + msg <- paste0( + 'The following elements of `bounds_list` do not have a length of 2: ', + paste(names(bounds_list)[!length_two], collapse = ', ') + ) + stop(msg) + } + + # Each element must be numeric + not_numeric <- sapply(bounds_list, function(x) {!is.numeric(x)}) + + if (any(not_numeric)) { + msg <- paste0( + 'The following elements of `bounds_list` are not numeric: ', + paste(names(bounds_list)[not_numeric], collapse = ', ') + ) + stop(msg) + } + + return(invisible(NULL)) +} + +# A helper function for checking the initial guess for mistakes; if an issue is +# found, this function will throw an error or a warning; otherwise it will be +# silent with no return value. +check_initial_ind_arg_values <- function( + independent_args, + lbounds, + ubounds, + initial_ind_arg_values +) +{ + # Check the length + if (length(initial_ind_arg_values) != length(independent_args)) { + stop('`initial_ind_arg_values` must have the same length as `independent_args`') + } + + # Check to make sure the initial values are not outside the bounds + outside_bounds <- sapply(seq_along(initial_ind_arg_values), function(i) { + initial_ind_arg_values[i] < lbounds[i] | initial_ind_arg_values[i] > ubounds[i] + }) + + if (any(outside_bounds)) { + msg <- paste0( + 'The initial values for the following arguments lie outside the bounds: ', + paste(names(independent_args)[outside_bounds], collapse = ', ') + ) + stop(msg) + } + + # Check to see if any initial values are on the bounds + eps <- sqrt(.Machine$double.eps) + + on_bounds <- sapply(seq_along(initial_ind_arg_values), function(i) { + abs(initial_ind_arg_values[i] - lbounds[i]) < eps | + abs(initial_ind_arg_values[i] - ubounds[i]) < eps + }) + + if (any(on_bounds)) { + msg <- paste0( + 'The initial values for the following arguments lie on the ', + 'bounds, which can be problematic for some optimizers: ', + paste(names(independent_args)[on_bounds], collapse = ', ') + ) + warning(msg) + } + + return(invisible(NULL)) +} + +bounds_table <- function( + independent_args, + bounds_list, + initial_ind_arg_values = NULL +) +{ + # Check the bounds_list + check_bounds_list(bounds_list, independent_args) + + # Get an ordering for the elements of `bounds_list` so they match the order + # of elements in `independent_args`; note that this will also exclude any + # elements of `bounds_list` that are not included in `independent_args`. + ordering <- match( + names(independent_args), + names(bounds_list) + ) + + bounds_list <- bounds_list[ordering] + + # Get the lower and upper bounds + lbounds <- sapply(bounds_list, min) + ubounds <- sapply(bounds_list, max) + + # Form the bounds table + bounds_table <- data.frame( + arg_name = names(independent_args), + lower = lbounds, + upper = ubounds, + stringsAsFactors = FALSE + ) + + # Include initial values in the table if they were provided + if (!is.null(initial_ind_arg_values)) { + # Check the values + check_initial_ind_arg_values( + independent_args, + lbounds, + ubounds, + initial_ind_arg_values + ) + + # Include the values + bounds_table$initial_value <- initial_ind_arg_values + } + + # Remove row names and return the table + rownames(bounds_table) <- NULL + + bounds_table +} diff --git a/man/bounds_table.Rd b/man/bounds_table.Rd new file mode 100644 index 0000000..b833c59 --- /dev/null +++ b/man/bounds_table.Rd @@ -0,0 +1,118 @@ +\name{bounds_table} + +\alias{bounds_table} + +\title{Create a table of lower and upper bounds} + +\description{ + During an optimization, it is often necessary to provide lower and upper + bounds for the parameters that are being varied. Typically they are specified + as numeric vectors, which often leads to confusing code, where the writer and + reader must remember which value corresponds to each argument; for example, + "the third element of the lower bound vector is for alphaLeaf." + + The purpose of \code{bounds_table} is to make the process of specifying bounds + simpler and easier to follow. It is expected that this function will be called + after \code{\link{objective_function}}. +} + +\usage{ + bounds_table( + independent_args, + bounds_list, + initial_ind_arg_values = NULL + ) +} + +\arguments{ + \item{independent_args}{ + The same value that was passed to \code{\link{objective_function}}. + } + + \item{bounds_list}{ + A list of named elements, where each element is a numeric vector of length + 2. The names should correspond to the independent arguments, and the values + should indicate lower and upper bounds for the corresponding parameter (in + any order). Any "extra" bounds (that is, bounds that do not correspond to + any independent argument) will be ignored. + } + + \item{initial_ind_arg_values}{ + A numeric vector of initial values for each of the independent arguments, + supplied in the same order as in \code{independent_args}. + } +} + +\details{ + The main purpose of this function is to create vectors of lower and upper + bounds, which are returned as the columns of a data frame. For each + independent argument in \code{independent_args}, the bounds are supplied via + the \code{bounds_list} input. The syntax is designed so the code calling this + function is easy for a human to parse. (See example below.) + + It is also (optionally) possible to provide an initial guess for each + independent argument via the \code{initial_ind_arg_values} argument. When + provided, these will be checked to make sure they do not lie outside the + bounds; an error will be thrown if any do lie outside the bounds. A warning + will also be thrown if any initial guesses lie on the bounds, since this can + be problematic for some optimizers, such as \code{\link[dfoptim]{nmkb}}. +} + +\value{ + A data frame with three or four columns: \code{arg_name}, \code{lower}, + \code{upper}, and (optionally) \code{initial_value}. + + The \code{lower} and \code{upper} columns are the lower and upper bounds, + determined from \code{bounds_list}. The \code{arg_name} column is the argument + name, and the rows of the table are ordered as in \code{independent_args}. + The \code{initial_value} column contains initial values, if they were + provided via the \code{initial_ind_arg_values} argument. +} + +\examples{ +# Make a list of independent arguments; the values are not used for anything +independent_args <- list( + alphaLeaf = 0, + alphaRoot = 0, + alphaStem = 0, + betaLeaf = 0, + betaRoot = 0, + betaStem = 0 +) + +# Specify bounds and initial guess for each. Note that: +# +# - The bounds will be reordered to follow the same order as the +# `independent_args`, but the initial guess is assumed to already follow the +# same order as the `independent_args`. +# +# - The bounds for the two extra parameters are ignored when forming the table. +# +# - The lower and upper bounds can be supplied as (upper, lower) +# or (lower, upper) pairs. +# +b_ll <- -50 # Lower limit for beta parameters +a_ul <- 50 # Upper limit for alpha parameters + +bounds <- bounds_table( + independent_args, + list( + betaStem = c(0, b_ll), + betaRoot = c(0, b_ll), + betaLeaf = c(0, b_ll), + alphaStem = c(0, a_ul), + alphaRoot = c(0, a_ul), + alphaLeaf = c(0, a_ul), + extraPar1 = c(0, 5), + extraPar2 = c(0, 6) + ), + c(1, 1, 1, -1, -1, -1) +) + +print(bounds) + +# Now the properly-ordered lower and upper limits can be accessed as follows: +bounds$lower + +bounds$lower +} diff --git a/tests/testthat/test-bounds_table.R b/tests/testthat/test-bounds_table.R new file mode 100644 index 0000000..7790528 --- /dev/null +++ b/tests/testthat/test-bounds_table.R @@ -0,0 +1,98 @@ +independent_args <- list(param1 = 0.1, param2 = 0.2, param3 = 0.3) + +bounds_list <- list( + param1 = c(0, 1), + param3 = c(4, 3), + param2 = c(1, 2), + param4 = c(7, 8) +) + +test_that('Bounds tables can be created', { + # No initial values + expect_silent( + bounds_table( + independent_args, + bounds_list + ) + ) + + # Initial values + bounds <- expect_silent( + bounds_table( + independent_args, + bounds_list, + c(0.5, 1.5, 3.5) + ) + ) + + expect_equal( + bounds[['arg_name']], + c('param1', 'param2', 'param3') + ) + + expect_equal( + bounds[['lower']], + c(0, 1, 3) + ) + + expect_equal( + bounds[['upper']], + c(1, 2, 4) + ) +}) + +test_that('Values outside the bounds are detected', { + expect_error( + bounds_table( + independent_args, + bounds_list, + c(0.5, 2.5, 3.5) + ), + 'The initial values for the following arguments lie outside the bounds: param2' + ) +}) + +test_that('Values on the bounds are detected', { + expect_warning( + bounds_table( + independent_args, + bounds_list, + c(0.5, 2.0, 3.5) + ), + 'The initial values for the following arguments lie on the bounds, which can be problematic for some optimizers: param2' + ) +}) + +test_that('Missing bounds are detected', { + expect_error( + bounds_table( + independent_args, + 1.0 + ), + '`bounds_list` must be a list of named elements' + ) + + expect_error( + bounds_table( + independent_args, + list(1, 2, 3) + ), + '`bounds_list` must be a list of named elements' + ) + + expect_error( + bounds_table( + independent_args, + bounds_list[1:2] + ), + 'The following elements were included in `independent_args` but not `bounds_list`: param2' + ) + + expect_error( + bounds_table( + independent_args, + within(bounds_list, {param1 = 1.0}) + ), + 'The following elements of `bounds_list` do not have a length of 2: param1' + ) +}) diff --git a/vignettes/parameterizing_soybean_biocro.Rmd b/vignettes/parameterizing_soybean_biocro.Rmd index 5be249e..dd48bbc 100644 --- a/vignettes/parameterizing_soybean_biocro.Rmd +++ b/vignettes/parameterizing_soybean_biocro.Rmd @@ -557,38 +557,43 @@ in the past: There are many possible ways to specify the bounds in R, but ultimately they must be expressed as numeric vectors, where the elements are ordered as they are -in `independent_args`. Here we supply lower and upper bounds for each parameter -in a data frame and then ensure they are properly ordered. Later, the data frame -columns can be passed to the optimizer as the bounds: +in `independent_args`. Here we use the `bounds_table` function from +`BioCroValidation` to create a data frame where the lower and upper bounds are +stored as columns. Later, the columns can be passed to the optimizer. The +`bounds_table` function will also check the initial guess to ensure it lies +within the bounds; for more information about this function, see its help page +by typing `?bounds_table` from an R terminal. ```{r setting_bounds} -# Define a table with the bounds -bounds <- read.table( - textConnection(' - arg_name lower upper - alphaLeaf +0 +50 - alphaStem +0 +50 - alphaShell +0 +50 - alphaSeneLeaf +0 +50 - alphaSeneStem +0 +50 - betaLeaf -50 +0 - betaStem -50 +0 - betaShell -50 +0 - betaSeneLeaf -50 +0 - betaSeneStem -50 +0 - rateSeneLeaf +0 +0.0125 - rateSeneStem +0 +0.005 - mrc_leaf +1e-6 +1e-2 - mrc_root +1e-6 +1e-2 - grc_stem +8e-4 +0.08 - grc_root +0.0025 +0.075 - '), - header = TRUE, - stringsAsFactors = FALSE +# Specify some bounds +aul <- 50 # Upper limit for alpha parameters +bll <- -50 # Lower limit for beta parameters +mll <- 1e-6 # Lower limit for mrc parameters +mul <- 1e-2 # Upper limit for mrc parameters + +# Define a table with the bounds in the same order as `independent_args` +bounds <- bounds_table( + independent_args, + list( + alphaLeaf = c(0, aul), + alphaStem = c(0, aul), + alphaShell = c(0, aul), + alphaSeneLeaf = c(0, aul), + alphaSeneStem = c(0, aul), + betaLeaf = c(bll, 0), + betaStem = c(bll, 0), + betaShell = c(bll, 0), + betaSeneLeaf = c(bll, 0), + betaSeneStem = c(bll, 0), + rateSeneLeaf = c(0, 0.0125), + rateSeneStem = c(0, 0.005), + mrc_leaf = c(mll, mul), + mrc_root = c(mll, mul), + grc_stem = c(8e-4, 0.08), + grc_root = c(0.0025, 0.075) + ), + initial_guess ) - -# Make sure it is ordered properly -bounds <- bounds[match(independent_arg_names, bounds[['arg_name']]), ] ``` ## Running the Optimizer From ffd4f85a2a89dee8111e05cf811385bfa3165c94 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Thu, 22 May 2025 18:51:00 -0500 Subject: [PATCH 48/54] Simplify vignette a bit --- vignettes/parameterizing_soybean_biocro.Rmd | 182 +++++++------------- 1 file changed, 61 insertions(+), 121 deletions(-) diff --git a/vignettes/parameterizing_soybean_biocro.Rmd b/vignettes/parameterizing_soybean_biocro.Rmd index dd48bbc..3db1dea 100644 --- a/vignettes/parameterizing_soybean_biocro.Rmd +++ b/vignettes/parameterizing_soybean_biocro.Rmd @@ -83,41 +83,11 @@ pre-processing to get it ready. One issue is that the data set specifies the doy of year (DOY) for each harvest, but we need to specify the time using BioCro's convention (the number of hours -since the start of the year). This can be addressed by defining a helper -function that adds a new `time` column following BioCro's convention: - -```{r convert_time} -# Define a helping function for adding a `time` column -convert_time <- function(data_table) { - within(data_table, { - # Define new `time` column - time = (DOY - 1) * 24.0 - }) -} -``` +since the start of the year). Another issue is that the data set includes pod and seed values, but Soybean-BioCro calculates shell and seed masses, where the shell and seed -together comprise the pod. This can also be addressed with helper functions, -which will be different for biomass values and standard deviations: - -```{r get_shell} -# Define a helping function for calculating shell biomass -add_shell_biomass <- function(data_table) { - within(data_table, { - # The shell is all parts of the pod other than the seed - Shell_Mg_per_ha = Rep_Mg_per_ha - Seed_Mg_per_ha - }) -} - -# Define a helping function for calculating shell biomass standard deviation -add_shell_stdev <- function(data_table) { - within(data_table, { - # Add uncertainties in quadrature, a simple approach to error propagation - Shell_Mg_per_ha = sqrt(Rep_Mg_per_ha^2 + Seed_Mg_per_ha^2) - }) -} -``` +together comprise the pod. Although the observations do not include root biomass, it is nevertheless important to constrain the predicted root mass to reasonable values. To do this, @@ -135,82 +105,60 @@ of 1 implies a standard deviation of \(1 / e - 10^{-5} \approx 0.3678694\). See the documentation page (`?objective_function`) for more information about this weighting method. -The root mass and its standard deviation can be added with another set of helper -functions. Note that any observations set to `NA` will be ignored when -calculating the error metric. - -```{r get_root} -# Define a helping function for adding the root mass -add_root_biomass <- function(data_table) { - # Initialize all values to NA - data_table$Root_Mg_per_ha <- NA +Finally, the data set includes some values that are not needed for the +parameterization. This includes the leaf litter accumulated between each +harvest, as well as the `DOY` and `Rep_Mg_per_ha` columns that have been +superseded by other columns defined above. - # Estimate a mass at one time point - row_to_use <- 5 +Here we will define a helping function that can accomplish the required +modifications described above; note that some operations are different depending +on whether the table represents biomass values or standard deviations: - col_to_add <- c( - 'Leaf_Mg_per_ha', - 'Stem_Mg_per_ha', - 'Rep_Mg_per_ha' - ) +```{r process_tables} +# Define a helping function for processing data tables +process_table <- function(data_table, type) { + # Define new `time` column + data_table$time <- (data_table$DOY - 1) * 24.0 - data_table[row_to_use, 'Root_Mg_per_ha'] <- - 0.17 * sum(data_table[row_to_use, col_to_add]) + # Define new `Shell_Mg_per_ha` column + data_table$Shell_Mg_per_ha <- if (type == 'biomass') { + # The shell is all parts of the pod other than the seed + data_table$Rep_Mg_per_ha - data_table$Seed_Mg_per_ha + } else { + # Add uncertainties in quadrature, a simple approach to error propagation + sqrt(data_table$Rep_Mg_per_ha^2 + data_table$Seed_Mg_per_ha^2) + } - data_table -} + # Define new `Root_Mg_per_ha` column + if (type == 'biomass') { + # Initialize all values to NA + data_table$Root_Mg_per_ha <- NA -# Define a helping function for adding the root standard deviation -add_root_stdev <- function(data_table) { - # We can set a value for each time point; any time points where the root mass - # is NA will be ignored - data_table$Root_Mg_per_ha <- 1 / exp(1) - 1e-5 + # Estimate a mass at one time point + row_to_use <- 5 - data_table -} -``` + col_to_add <- c( + 'Leaf_Mg_per_ha', + 'Stem_Mg_per_ha', + 'Rep_Mg_per_ha' + ) -Finally, the data set includes some values that are not needed for the -parameterization. This includes the leaf litter accumulated between each -harvest, as well as the `DOY` and `Rep_Mg_per_ha` columns that have been -superseded by other columns defined above. These can be removed with a final -helper function: - -```{r remove_columns} -# Define a helping function for removing unneeded columns -remove_extra_columns <- function(data_table) { - within(data_table, { - # Remove columns by setting them to NULL - DOY = NULL - Rep_Mg_per_ha = NULL - Litter_Mg_per_ha = NULL - }) -} -``` + data_table[row_to_use, 'Root_Mg_per_ha'] <- + 0.17 * sum(data_table[row_to_use, col_to_add]) + } else { + # We can set a value for each time point; any time points where the root + # mass is NA will be ignored + data_table$Root_Mg_per_ha <- 1 / exp(1) - 1e-5 + } -Now we can apply these to each table in the set: + # Remove columns by setting them to NULL + data_table$DOY = NULL + data_table$Rep_Mg_per_ha = NULL + data_table$Litter_Mg_per_ha = NULL -```{r process_tables} -# Process the data sets (biomass and stdev from 2002 and 2005) -ambient_2002_biomass <- - remove_extra_columns(add_root_biomass(add_shell_biomass(convert_time( - soyface_biomass[['ambient_2002']] - )))) - -ambient_2005_biomass <- - remove_extra_columns(add_root_biomass(add_shell_biomass(convert_time( - soyface_biomass[['ambient_2005']] - )))) - -ambient_2002_stdev <- - remove_extra_columns(add_root_stdev(add_shell_stdev(convert_time( - soyface_biomass[['ambient_2002_std']] - )))) - -ambient_2005_stdev <- - remove_extra_columns(add_root_stdev(add_shell_stdev(convert_time( - soyface_biomass[['ambient_2005_std']] - )))) + # Return the processed table + data_table +} ``` ## The Data-Driver Pairs @@ -224,14 +172,14 @@ observed biomass, and the weight to assign to each year: # Define the data-driver pairs data_driver_pairs <- list( ambient_2002 = list( - data = ambient_2002_biomass, - data_stdev = ambient_2002_stdev, + data = process_table(soyface_biomass[['ambient_2002']], 'biomass'), + data_stdev = process_table(soyface_biomass[['ambient_2002_std']], 'stdev'), drivers = BioCro::soybean_weather[['2002']], weight = 1 ), ambient_2005 = list( - data = ambient_2005_biomass, - data_stdev = ambient_2005_stdev, + data = process_table(soyface_biomass[['ambient_2005']], 'biomass'), + data_stdev = process_table(soyface_biomass[['ambient_2005_std']], 'stdev'), drivers = BioCro::soybean_weather[['2005']], weight = 1 ) @@ -691,7 +639,7 @@ percent larger than with the default parameter values. The optimized parameter values could likely be improved by using a more stringent tolerance for the optimizer, but this would require more time to run. -# Comparing Model Outputs +## Comparing Model Outputs Another way to evaluate the results of the optimization is to compare simulations using the default, perturbed, and re-optimized versions of the @@ -767,10 +715,10 @@ xyplot( ) ``` -Here we can see that while the simulated stem, root, and grain (seed) biomass -values do not differ much between the models, there are large differences in the -leaf mass, where the default and re-optimized versions are similar and the -perturbed version predicts much smaller values. +Here we can see that while the simulated values for some tissues do not differ +much between the models, there are large differences in other tissues; for these +cases, the default and re-optimized versions are similar and the perturbed +version is much different. ## Saving the New Model Definition @@ -820,19 +768,7 @@ writeLines(r_cmd_string, './soybean_reparam.R') <> ### -### Define helping functions -### - -<> - -<> - -<> - -<> - -### -### Prepare inputs for `objective_function` and create an objective function +### Prepare inputs for `objective_function` ### <> @@ -853,6 +789,10 @@ writeLines(r_cmd_string, './soybean_reparam.R') <> +### +### Create the objective function +### + <> ### From fbeedeb7d58f91fca14e56ab16f06c3d77e412db Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Thu, 22 May 2025 19:06:19 -0500 Subject: [PATCH 49/54] A little more error checking --- R/objective_function_input_checks.R | 13 ++++++++- tests/testthat/test-objective_function.R | 34 ++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/R/objective_function_input_checks.R b/R/objective_function_input_checks.R index dd37fc9..7a7a5a0 100644 --- a/R/objective_function_input_checks.R +++ b/R/objective_function_input_checks.R @@ -245,8 +245,19 @@ check_runner_results <- function( stop(msg) } + # Make sure each runner produces a data frame + is_df <- sapply(initial_runner_res, is.data.frame) + + if (any(!is_df)) { + msg <- paste( + 'Some runners did not produce data frames:', + paste(names(initial_runner_res)[!is_df], collapse = ', ') + ) + stop(msg) + } + # Make sure each runner produces the necessary columns in its output - expected_columns <- as.character(full_data_definitions) + expected_columns <- c('time', as.character(full_data_definitions)) missing_columns <- lapply(initial_runner_res, function(res) { expected_columns[!expected_columns %in% colnames(res)] diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index 91bdf60..6fee66a 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -410,6 +410,40 @@ test_that('Bad return values are detected', { ), 'The objective function did not return a finite value when using the initial argument values; instead, it returned: NA' ) + + # A post-processing function removes the `time` column + expect_error( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = function(x) {within(x, { + Pod = Grain + Shell + time = NULL + })}, + verbose_startup = verbose_startup + ), + 'Some data columns were missing from runner outputs: +ambient_2002: time +ambient_2005: time', + fixed = TRUE + ) + + # A post-processing function doesn't return a data frame + expect_error( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = function(x) {1.0}, + verbose_startup = verbose_startup + ), + 'Some runners did not produce data frames: ambient_2002, ambient_2005' + ) }) test_that('Bad regularization methods are detected', { From d89fe452933c2d02234b544643bea743dd2ae1e4 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Thu, 22 May 2025 19:13:15 -0500 Subject: [PATCH 50/54] Fix silly mistake in DESCRIPTION --- DESCRIPTION | 3 --- 1 file changed, 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4dfa230..720c39c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,9 +8,6 @@ Authors@R: c( comment = c(ORCID = "0000-0002-4912-9783")), person("BioCroField authors", role = "cph") ) -License: MIT + file LICENSE -Encoding: UTF-8 -LazyData: true Depends: R (>= 3.6.0) Imports: From 382678f81e0cbce7528617773ff5d4934e8c3976 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 23 May 2025 09:59:31 -0500 Subject: [PATCH 51/54] Add more info about `bounds_table` --- DESCRIPTION | 1 + man/bounds_table.Rd | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/DESCRIPTION b/DESCRIPTION index 720c39c..78e5225 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -14,6 +14,7 @@ Imports: BioCro (>= 3.2.0) Suggests: dfoptim, + DEoptim, lattice, knitr, rmarkdown, diff --git a/man/bounds_table.Rd b/man/bounds_table.Rd index b833c59..45cd68a 100644 --- a/man/bounds_table.Rd +++ b/man/bounds_table.Rd @@ -56,6 +56,10 @@ bounds; an error will be thrown if any do lie outside the bounds. A warning will also be thrown if any initial guesses lie on the bounds, since this can be problematic for some optimizers, such as \code{\link[dfoptim]{nmkb}}. + + Some optimizers, such as \code{\link[DEoptim]{DEoptim}}, do not require an + initial guess; in this case, there is no strong need to pass an initial guess + to \code{bounds_table}. } \value{ From 43a0e74baaf2a2dc2322842517554b94ac735c7e Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 23 May 2025 09:59:57 -0500 Subject: [PATCH 52/54] Allow flexible `epsilon` --- R/objective_function.R | 6 +- R/objective_function_helpers.R | 32 +++++- man/objective_function.Rd | 114 ++++++++++++++++---- tests/testthat/test-objective_function.R | 1 + vignettes/parameterizing_soybean_biocro.Rmd | 28 +++-- 5 files changed, 139 insertions(+), 42 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index 1730e29..6c21e00 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -8,7 +8,9 @@ objective_function <- function( quantity_weights, data_definitions = list(), normalization_method = 'mean_max', + normalization_param = NULL, stdev_weight_method = 'equal', + stdev_weight_param = NULL, regularization_method = 'none', dependent_arg_function = NULL, post_process_function = NULL, @@ -73,13 +75,15 @@ objective_function <- function( long_form_data <- add_norm( long_form_data, normalization_method, + normalization_param, length(data_driver_pairs) ) # Add variance-based weights long_form_data <- add_w_var( long_form_data, - stdev_weight_method + stdev_weight_method, + stdev_weight_param ) # Print the long form data, if desired. Do this before checking the data, diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R index ce32bef..ae5ef09 100644 --- a/R/objective_function_helpers.R +++ b/R/objective_function_helpers.R @@ -174,8 +174,23 @@ add_time_indices <- function(initial_runner_res, long_form_data) { long_form_data } +# Helping function for getting a parameter value that has a default +get_param_default <- function(param, default) { + if (is.null(param)) { + default + } else { + param + } +} + # Helping function for getting normalization factors -add_norm <- function(long_form_data, normalization_method, n_ddp) { +add_norm <- function( + long_form_data, + normalization_method, + normalization_param, + n_ddp +) +{ for (i in seq_along(long_form_data)) { data_table <- long_form_data[[i]] @@ -191,10 +206,17 @@ add_norm <- function(long_form_data, normalization_method, n_ddp) { nrow(qname_subset) * n_ddp } else if (tolower(normalization_method) == 'max') { max(qname_subset[['quantity_value']])^2 + } else if (tolower(normalization_method) == 'obs') { + eps <- get_param_default(normalization_param, 1e-1) + data_table[i, 'quantity_value']^2 + eps } else if (tolower(normalization_method) == 'mean_max') { npts <- nrow(qname_subset) qmax <- max(qname_subset[['quantity_value']]) npts * n_ddp * qmax^2 + } else if (tolower(normalization_method) == 'mean_obs') { + npts <- nrow(qname_subset) + eps <- get_param_default(normalization_param, 1e-1) + npts * n_ddp * (data_table[i, 'quantity_value']^2 + eps) } else { stop('Unsupported normalization_method: ', normalization_method) } @@ -207,7 +229,7 @@ add_norm <- function(long_form_data, normalization_method, n_ddp) { } # Helping function for getting variance-based weights -add_w_var <- function(long_form_data, stdev_weight_method) { +add_w_var <- function(long_form_data, stdev_weight_method, stdev_weight_param) { for (i in seq_along(long_form_data)) { data_table <- long_form_data[[i]] data_stdev <- data_table[['quantity_stdev']] @@ -216,9 +238,11 @@ add_w_var <- function(long_form_data, stdev_weight_method) { if (tolower(stdev_weight_method) == 'equal') { 1.0 } else if (tolower(stdev_weight_method) == 'logarithm') { - log(1 / (data_stdev + 1e-5)) + eps <- get_param_default(stdev_weight_param, 1e-5) + log(1 / (data_stdev + eps)) } else if (tolower(stdev_weight_method) == 'inverse') { - 1 / data_stdev^2 + eps <- get_param_default(stdev_weight_param, 1e-1) + 1 / (data_stdev^2 + eps) } else { stop('Unsupported stdev_weight_method: ', stdev_weight_method) } diff --git a/man/objective_function.Rd b/man/objective_function.Rd index 4693d9c..2cb1e5a 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -22,6 +22,9 @@ It is also possible to include "dependent" model arguments, whose values are determined from the "independent" model arguments that are varied during the parameterization procedure. + + For a detailed example of using \code{objective_function}, see the + "Parameterizing Soybean-BioCro" vignette/article. } \usage{ @@ -32,7 +35,9 @@ quantity_weights, data_definitions = list(), normalization_method = 'mean_max', + normalization_param = NULL, stdev_weight_method = 'equal', + stdev_weight_param = NULL, regularization_method = 'none', dependent_arg_function = NULL, post_process_function = NULL, @@ -57,7 +62,8 @@ \code{time}, whose values follow BioCro's definition of \code{\link[BioCro]{time}}; the other columns should represent observed values of model outputs. Any \code{NA} values in \code{data} will - be ignored when calculating the error metric. + be ignored when calculating the error metric, but all non-\code{NA} values + of all columns (except \code{time}) will be compared to the model output. The \code{drivers} element must be a data frame that can be passed to \code{\link[BioCro]{run_biocro}} as its \code{drivers} input argument. @@ -75,8 +81,8 @@ \item{independent_args}{ A list of named numeric values. The names will determine the independent arguments to be varied during their optimization, and the values specify - "test" values of each argument that will be used internally to check that - the objective function is properly defined and can be evaluated. + "initial" or "test" values of each argument that will be used internally to + check that the objective function is properly defined and can be evaluated. } \item{quantity_weights}{ @@ -101,11 +107,25 @@ error metric; see below for more details. } + \item{normalization_param}{ + An (optional) parameter value used by some normalization methods. When + \code{normalization_param} is \code{NULL}, a default value will be used, + which depends on the particular normalization method. Otherwise, the + user-specified value will be used. See details below. + } + \item{stdev_weight_method}{ A string indicating the method to be used when calculating the variance-based weights used in the error metric; see below for more details. } + \item{stdev_weight_param}{ + An (optional) parameter value used by some normalization methods. When + \code{stdev_weight_param} is \code{NULL}, a default value will be used, + which depends on the particular normalization method. Otherwise, the + user-specified value will be used. See details below. + } + \item{regularization_method}{ A string indicating the regularization method to be used when calculating the regularization penalty term; see below for more details. @@ -327,18 +347,33 @@ as \deqn{w_i^{stdev} =% - ln \left( \frac{1}{\sigma + 10^{-5}} \right),} - where \eqn{ln} denotes a logarithm with base \eqn{e}. This method - was used in the original Soybean-BioCro paper. Note: this method - should be used with caution, because \eqn{w_i^{stdev}} is zero - for \eqn{\sigma = 1 - 10^{-5} = 0.99999}, and it becomes negative for - larger values of \eqn{\sigma}. + ln \left( \frac{1}{\sigma_i + \epsilon} \right),} + where \eqn{ln} denotes a logarithm with base \eqn{e} and + \eqn{\epsilon} is a small number included to prevent numerical errors + that would otherwise occur when \eqn{\sigma = 0}. This method was + used in the original Soybean-BioCro paper. + + The value of \eqn{\epsilon} is specified by the + \code{stdev_weight_param} input argument, which defaults to + \code{1e-5} when \code{stdev_weight_param} is \code{NULL} when using + this method. With the default value of \eqn{\epsilon}, + \eqn{w_i^{stdev} \approx 11.512} when \eqn{\sigma = 0}. + + Note: this method should be used with caution, because + \eqn{w_i^{stdev}} is zero for \eqn{\sigma_i = 1 - \epsilon}, and it + becomes negative for larger values of \eqn{\sigma_i}. \item \code{'inverse'}: For this method, \eqn{w_i^{stdev}} is calculated as - \deqn{w_i^{stdev} = \frac{1}{\sigma^2}.} - Note: this method should be used with caution, because - \eqn{w_i^{stdev}} is infinite when \eqn{\sigma} is zero. + \deqn{w_i^{stdev} = \frac{1}{\sigma_i^2 + \epsilon},} + where \eqn{\epsilon} is a small number included to prevent numerical + errors that would otherwise occur when \eqn{\sigma_i = 0}. + + The value of \eqn{\epsilon} is specified by the + \code{stdev_weight_param} input argument, which defaults to + \code{1e-1} when \code{stdev_weight_param} is \code{NULL} when using + this method. With the default value of \eqn{\epsilon}, + \eqn{w_i^{stdev} = 10} when \eqn{\sigma_i = 0}. } If any values of \eqn{w_i^{stdev}} are undefined, negative, or infinite, an @@ -377,6 +412,23 @@ method. This approach avoids over-representing variable types with larger magnitude when determining \eqn{E_{agreement}}. + \item \code{'obs'}: For this method, + + \deqn{N_i = \left( Y_{obs}^i \right)^2 + \epsilon,} + where \eqn{\epsilon} is a small number included to prevent numerical + errors that would otherwise occur when \eqn{Y_{obs}^i = 0}. In this + case, the equation for \eqn{E_{agreement}} essentially uses relative + differences rather than absolute differences, which is achieved by + normalizing by the observed values, hence the name. This approach + avoids over-representing time points where a particular quantity takes + its largest values when determining \eqn{E_{agreement}}. + + The value of \eqn{\epsilon} is specified by the + \code{normalization_param} input argument, which defaults to + \code{1e-1} when \code{normalization_param} is \code{NULL} when using + this method. With the default value of \eqn{\epsilon}, \eqn{N_i = 10} + when \eqn{Y_{obs}^i = 0}. + \item \code{'mean_max'}: For this method, the "mean" and "max" methods are combined so that @@ -384,10 +436,27 @@ \cdot n_{data} \cdot \left( max_{vtype}^{vdata} \right)^2.} This approach avoids over-representing drivers with larger numbers of associated observations, and variable types with larger magnitudes. - This method is used for parameterizing Soybean-BioCro, and is - recommended for most situations. + This method is used for parameterizing Soybean-BioCro. + + \item \code{'mean_obs'}: For this method, the "mean" and "obs" methods are + combined so that + + \deqn{N_i = n_{vtype}^{vdrivers} \cdot n_{data}% + \cdot \left( \left( Y_{obs}^i \right)^2 + \epsilon \right)^2.} + This approach avoids over-representing drivers with larger numbers of + associated observations, and time points with larger observed values. + + The value of \eqn{\epsilon} is specified by the + \code{normalization_param} input argument, which defaults to + \code{1e-1} when \code{normalization_param} is \code{NULL} when using + this method. With the default value of \eqn{\epsilon}, + \eqn{N_i = 10 \cdot n_{vtype}^{vdrivers} \cdot n_{data}} when + \eqn{Y_{obs}^i = 0}. } + In most situations, it is recommended to use either \code{'mean_max'} or + \code{'mean_obs'} depending on user preference or performance. + \strong{Regularization methods} The following regularization methods are available: @@ -437,11 +506,12 @@ \code{post_process_function}. \item Ensuring that each independent and dependent argument name is either - a parameter or initial value of the model. This is accomplished using - \code{\link[BioCro]{partial_run_biocro}}. Note that argument names - passed to \code{partial_run_biocro} can technically include drivers, - but it is unlikely that the value of a driver would be varied during - an optimization, so the argument names are not allowed to include + a parameter or initial value of the model. Internally, argument names + are passed to \code{\link[BioCro]{partial_run_biocro}} via its + \code{arg_names} input. Note that argument names passed to + \code{partial_run_biocro} can technically include drivers, but it is + unlikely that the value of a driver would be varied during an + optimization, so the argument names are not allowed to include columns in the drivers. \item Ensuring that the optional \code{dependent_arg_function}, @@ -449,9 +519,9 @@ functions can be run without causing errors. \item Ensuring that certain values are finite (such as \eqn{Y_{obs}}, - \eqn{\sigma}, \eqn{w_i^{stdev}}, and \eqn{N}), and that certain values - are not negative (such as \eqn{\sigma}, \eqn{w_i^{stdev}}, and - \eqn{N}). + \eqn{\sigma_i}, \eqn{w_i^{stdev}}, and \eqn{N_i}), and that certain + values are not negative (such as \eqn{\sigma_i}, \eqn{w_i^{stdev}}, + and \eqn{N_i}). } If any issues are detected, an informative error message will be sent. Note diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index 6fee66a..dcfccf1 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -473,6 +473,7 @@ test_that('Bad data values and weights are detected', { independent_args, quantity_weights, stdev_weight_method = 'inverse', + stdev_weight_param = 0, data_definitions = data_definitions, post_process_function = post_process_function, verbose_startup = verbose_startup diff --git a/vignettes/parameterizing_soybean_biocro.Rmd b/vignettes/parameterizing_soybean_biocro.Rmd index 3db1dea..651894a 100644 --- a/vignettes/parameterizing_soybean_biocro.Rmd +++ b/vignettes/parameterizing_soybean_biocro.Rmd @@ -100,10 +100,10 @@ added to the observed values. In previous parameterizations, a standard deviation for the root mass was not explicitly estimated; instead, the standard-deviation-based weight factor was -implicitly set to 1. Because the `'logarithm'` method was used, a weight factor -of 1 implies a standard deviation of \(1 / e - 10^{-5} \approx 0.3678694\). See -the documentation page (`?objective_function`) for more information about this -weighting method. +implicitly set to 1. Because the `'logarithm'` method with +\(\epsilon = 10^{-5}\) was used, a weight factor of 1 implies a standard +deviation of \(1 / e - 10^{-5} \approx 0.3678694\). See the documentation page +(`?objective_function`) for more information about this weighting method. Finally, the data set includes some values that are not needed for the parameterization. This includes the leaf litter accumulated between each @@ -129,14 +129,12 @@ process_table <- function(data_table, type) { sqrt(data_table$Rep_Mg_per_ha^2 + data_table$Seed_Mg_per_ha^2) } - # Define new `Root_Mg_per_ha` column - if (type == 'biomass') { - # Initialize all values to NA - data_table$Root_Mg_per_ha <- NA + # Define new `Root_Mg_per_ha` column, which has just one non-NA value + row_to_use <- 5 # Choose row to use + data_table$Root_Mg_per_ha <- NA # Initialize all values to NA + if (type == 'biomass') { # Estimate a mass at one time point - row_to_use <- 5 - col_to_add <- c( 'Leaf_Mg_per_ha', 'Stem_Mg_per_ha', @@ -146,9 +144,8 @@ process_table <- function(data_table, type) { data_table[row_to_use, 'Root_Mg_per_ha'] <- 0.17 * sum(data_table[row_to_use, col_to_add]) } else { - # We can set a value for each time point; any time points where the root - # mass is NA will be ignored - data_table$Root_Mg_per_ha <- 1 / exp(1) - 1e-5 + # Estimate standard deviation at one time point + data_table[row_to_use, 'Root_Mg_per_ha'] <- 1 / exp(1) - 1e-5 } # Remove columns by setting them to NULL @@ -397,8 +394,8 @@ more details to discuss: for more details. - Soybean-BioCro has always used the `'logarithm'` method for determining - weights from standard deviations; see Equation 17 of - @matthews_soybean_biocro_2021 for more details. + weights from standard deviations with \(\epsilon = 10^{-5}\); see Equation 17 + of @matthews_soybean_biocro_2021 for more details. - Soybean-BioCro has not used any regularization. @@ -416,6 +413,7 @@ obj_fun <- objective_function( data_definitions = data_definitions, normalization_method = 'mean_max', stdev_weight_method = 'logarithm', + stdev_weight_param = 1e-5, regularization_method = 'none', dependent_arg_function = dependent_arg_function, post_process_function = post_process_function, From 4b3e3490cc9fa3aa4672a2824272dcaed613d951 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 23 May 2025 15:46:09 -0500 Subject: [PATCH 53/54] Add "Getting Started" vignette --- NEWS.md | 7 ++- README.md | 9 ++++ vignettes/BioCroValidation.Rmd | 92 ++++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 vignettes/BioCroValidation.Rmd diff --git a/NEWS.md b/NEWS.md index ab5e3fa..350c164 100644 --- a/NEWS.md +++ b/NEWS.md @@ -38,14 +38,17 @@ be directly added to this file to describe the related changes. - Added 2002 and 2005 SoyFACE biomass data. - Added several new functions: `objective_function`, `update_model`, and - `bounds_table` + `bounds_table`. -- Added a vignette illustrating how to perform a model parameterization. +- Added two new vignettes: a "Getting Started" article (`BioCroValidation.Rmd`) + and a user guide illustrating how to perform a model parameterization + (`parameterizing_soybean_biocro.Rmd`). # Changes in BioCroValidation Version 0.1.0 - This is the first version of BioCroValidation. At this point, the package is in a state of rapid development, and not all changes will be described here. + - We are reserving version `1.0.0` for a more stable and complete future release; until then, major changes should only increase the minor version number. diff --git a/README.md b/README.md index 2b2c91c..f6eeee0 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,15 @@ remotes::install_github('BioCro/BioCroValidation') Note that this method requires the `remotes` package, which can be installed from within R by typing `install.packages('remotes')`. +### Usage + +The best way to learn about using `BioCroValidation` is to visit the +[BioCroValidation website](https://biocro.github.io/BioCroValidation/index.html) +and click the "Get Started" link in the top menu bar. The website includes +documentation for all the functions and data sets included in the package, as +well as articles that describe its general features and several important use +cases. + ### License The `BioCroValidation` R package and its documentation are licensed under the diff --git a/vignettes/BioCroValidation.Rmd b/vignettes/BioCroValidation.Rmd new file mode 100644 index 0000000..4982053 --- /dev/null +++ b/vignettes/BioCroValidation.Rmd @@ -0,0 +1,92 @@ +--- +title: "Getting Started With BioCroValidation" +output: + rmarkdown::html_vignette: + toc: true + number_sections: true +vignette: > + %\VignetteIndexEntry{Getting Started With BioCroValidation} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.width = 6, + fig.height = 5, + fig.align = "center" +) +``` + +# Overview + +**BioCroValidation** is an R package that provides a suite of tools for +validating BioCro crop growth models. + +Validation is a key part of using and developing BioCro models. The goal of this +package is to provide some convenient "helping" functions to aid with various +aspects of model validation, especially parameterization. + +The central tool in the package is the `objective_function` function. As its +name implies, it can be used to create an objective function that calculates the +value of an error metric value given a set of model parameter values. In turn, +the objective function itself can be passed to an optimizer. + +While it is possible for each BioCro user to write their own customized +objective function, creating one can be a very complex process because there are +many potential aspects to consider: + +- **Mathematical / Statistical Approach:** This refers to choices like "Do I + want to normalize the error terms by each observed value, or by the largest + observed value for each measured quantity?" or "Do I want to use L1 or L2 + regularization?" + +- **Implementation of Mathematical / Statistical Approach:** Once an approach + has been identified, code must be written to properly implement it. + +- **Error Checks:** A wide variety of strange conditions can occur during + parameterization, and the objective function must be ready to handle them. For + example, how should the objective function respond when a simulation does not + run to completion, or when an optimizer passes `NA` as a parameter value? + +- **Technical Details:** Parameterization can take a long time to perform, so it + is important for the objective function code to be as efficient and fast as + possible. + +The goal of `objective_function` is to allow users to make the key choices about +their mathematical approach using clear code statements like +`regularization_method = 'L2'`, while the implementation details, error checks, +and other technical details are handled internally. This will result in clear +scripts that are also more reliable. + +Besides `objective_function`, the package also includes a few other functions +with a similar goal of clarifying code and hiding implementation details, such +as `bounds_table` and `update_model`. + +# Installing BioCroValidation + +The easiest way to install `BioCroValidation` is to type the following from +within an R terminal: + +```{r, eval = FALSE} +remotes::install_github('biocro/BioCroValidation') +``` + +Note that this method requires the `remotes` package, which can be installed +from within R by typing `install.packages('remotes')`. + +# Learning More + +The `BioCroValidation` package includes extensive documentation. The best place +to start is the +[Parameterizing Soybean-BioCro](parameterizing_soybean_biocro.html) +article, which illustrates the full process of defining an objective function, +running an optimization, examining the results, and saving the new model +definition. + +Another key resource is the help page for `objective_function`, which can be +accessed online or by typing `?objective_function` in an R terminal. This +document explains all the available options for normalization approaches, +regularization approaches, and other aspects of defining an objective function. From 333d9015a9c92e524ae948f91b216a6465ed8615 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Fri, 23 May 2025 16:04:02 -0500 Subject: [PATCH 54/54] Update version information --- DESCRIPTION | 6 +++--- LICENSE | 2 +- LICENSE.md | 2 +- NEWS.md | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 78e5225..0e388e9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,12 +1,12 @@ Package: BioCroValidation -Version: 0.1.0 +Version: 0.2.0 Title: Tools for Validating BioCro Models Description: A collection of tools for validating BioCro crop growth models. Authors@R: c( person("Edward B.", "Lochocki", role = c('cre', 'aut'), email = "eloch@illinois.edu", comment = c(ORCID = "0000-0002-4912-9783")), - person("BioCroField authors", role = "cph") + person("BioCroValidation authors", role = "cph") ) Depends: R (>= 3.6.0) @@ -23,5 +23,5 @@ VignetteBuilder: knitr License: MIT + file LICENSE Encoding: UTF-8 LazyData: true -URL: https://github.com/BioCro/BioCroField, https://biocro.github.io/BioCroValidation/ +URL: https://github.com/BioCro/BioCroValidation, https://biocro.github.io/BioCroValidation/ Config/testthat/edition: 3 diff --git a/LICENSE b/LICENSE index 2826856..5d895d6 100644 --- a/LICENSE +++ b/LICENSE @@ -1,2 +1,2 @@ YEAR: 2025 -COPYRIGHT HOLDER: BioCroField authors +COPYRIGHT HOLDER: BioCroValidation authors diff --git a/LICENSE.md b/LICENSE.md index e13eb07..190d2df 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,6 +1,6 @@ # MIT License -Copyright (c) 2025 BioCroField authors +Copyright (c) 2025 BioCroValidation authors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/NEWS.md b/NEWS.md index 350c164..45cd934 100644 --- a/NEWS.md +++ b/NEWS.md @@ -33,9 +33,9 @@ In the case of a hotfix, a short section headed by the new release number should be directly added to this file to describe the related changes. --> -# UNRELEASED +# Changes in BioCroValidation Version 0.2.0 (2025-05-23) -- Added 2002 and 2005 SoyFACE biomass data. +- Added 2002 and 2005 SoyFACE biomass and standard deviation data. - Added several new functions: `objective_function`, `update_model`, and `bounds_table`.