From b6a5889d4d54fd8220c6ccd83c02778260b6772f Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Mon, 12 May 2025 14:26:22 -0500 Subject: [PATCH 01/13] Include full biomass data sets --- data/soyface_biomass.R | 84 +++++++++++------------- man/objective_function.Rd | 26 +++++--- man/soyface_biomass.Rd | 19 +++++- tests/testthat/test-objective_function.R | 19 +++--- 4 files changed, 85 insertions(+), 63 deletions(-) diff --git a/data/soyface_biomass.R b/data/soyface_biomass.R index bb4e1b9..dc36357 100644 --- a/data/soyface_biomass.R +++ b/data/soyface_biomass.R @@ -1,58 +1,54 @@ soyface_biomass <- list( - ambient_2002 = utils::read.table( + ambient_2002 = utils::read.csv( textConnection(' - DOY Leaf_Mg_per_ha Stem_Mg_per_ha Pod_Mg_per_ha - 179 0.180284339 0.085244969 0 - 189 0.554461942 0.418853893 0 - 203 1.326552931 1.711067366 0.000317148 - 217 1.697944007 2.892825897 0.079396325 - 231 1.807742782 3.68591426 1.554571303 - 246 1.578813648 3.745260717 3.97601356 - 259 0.947537773 3.618401575 6.544650656 - 288 0 2.305701225 7.008967628 - '), + DOY,Leaf_Mg_per_ha,Stem_Mg_per_ha,Rep_Mg_per_ha,Seed_Mg_per_ha,Litter_Mg_per_ha,CumLitter_Mg_per_ha + 179,0.180284339443109,0.085244969371991,0,0,0,0 + 189,0.55446194221275,0.41885389322975,0,0,0,0 + 203,1.32655293077725,1.71106736644195,0.0003171478564925,0,0,0 + 217,1.69794400686295,2.8928258965309,0.07939632545295,0,0,0 + 231,1.80774278200725,3.6859142604218,1.55457130346238,0,0.3818897637489,0.3818897637489 + 246,1.57881364816734,3.74526071710995,3.97601356048602,2.480314960431,0.281423009601227,0.663312773350127 + 259,0.947537773327332,3.61840157451295,6.5446506556431,4.846894137844,0.315325021846977,0.978637795197104 + 288,0,2.30570122466198,7.00896762848425,5.4133858263375,1.60453886688794,2.58317666208504'), header = TRUE ), - ambient_2005 = utils::read.table( + ambient_2005 = utils::read.csv( textConnection(' - DOY Leaf_Mg_per_ha Stem_Mg_per_ha Pod_Mg_per_ha - 172 0.222271875 0.188803125 0 - 186 0.8460375 0.85220625 0 - 200 1.184465625 1.61896875 0 - 214 2.218059375 4.043615625 0.29925 - 228 2.147446875 4.477725 2.304553125 - 242 1.51948125 3.8920875 5.532778125 - 256 0.06575625 2.89905 5.371078125 - 270 0 2.1756 6.372253125 - '), + DOY,Leaf_Mg_per_ha,Stem_Mg_per_ha,Rep_Mg_per_ha,Seed_Mg_per_ha,Litter_Mg_per_ha,CumLitter_Mg_per_ha + 172,0.222271875,0.188803125,0,0,0,0 + 186,0.8460375,0.85220625,0,0,0,0 + 200,1.184465625,1.61896875,0,0,0,0 + 214,2.218059375,4.043615625,0.29925,0,0.041475,0.06654375 + 228,2.147446875,4.477725,2.304553125,0,0.153628125,0.18230625 + 242,1.51948125,3.8920875,5.532778125,3.022490625,0.1157625,0.335934375 + 256,0.06575625,2.89905,5.371078125,3.998203125,0.5310375,0.866971875 + 270,0,2.1756,6.372253125,4.965646875,0.281465625,1.1484375'), header = TRUE ), - ambient_2002_std = utils::read.table( + ambient_2002_std = utils::read.csv( textConnection(' - DOY Leaf_Mg_per_ha Stem_Mg_per_ha Pod_Mg_per_ha - 179 0.04081555 0.017079737 0 - 189 0.163863274 0.138449025 0 - 203 0.133774434 0.183710759 0.000549316 - 217 0.228326658 0.448744065 0.030989998 - 231 0.202475421 0.453447471 0.218402544 - 246 0.075475165 0.275321356 0.573548828 - 259 0.344550033 0.151045378 0.743440701 - 288 0 0.148389261 0.141828617 - '), + DOY,Leaf_Mg_per_ha,Stem_Mg_per_ha,Rep_Mg_per_ha,Seed_Mg_per_ha,Litter_Mg_per_ha,CumLitter_Mg_per_ha + 179,0.040815550055146,0.0170797372473185,0,0,0,0 + 189,0.163863273875505,0.138449024803329,0,0,0,0 + 203,0.133774433506304,0.18371075937777,0.000549316200956573,0,0,0 + 217,0.228326657644473,0.448744065190288,0.0309899984999309,0,0,0 + 231,0.202475421469591,0.453447470747275,0.218402543503905,0,0.0456182274998721,0.0456182274998721 + 246,0.0754751653831798,0.275321356095752,0.573548828062959,0.36605836251352,0.0724761221542088,0.0676627659537074 + 259,0.344550032518756,0.151045377683595,0.743440701063986,0.514421560164731,0.0495275235053204,0.0320523860205109 + 288,0,0.148389260861146,0.141828616948454,0.115260823492553,0.275546039297454,0.267829516105734'), header = TRUE ), - ambient_2005_std = utils::read.table( + ambient_2005_std = utils::read.csv( textConnection(' - DOY Leaf_Mg_per_ha Stem_Mg_per_ha Pod_Mg_per_ha - 172 0.032896589 0.014318136 0 - 186 0.146798299 0.198830061 0 - 200 0.338074288 0.605286253 0 - 214 0.152175913 0.559874052 0.164275197 - 228 0.119077589 0.306744644 0.434148074 - 242 0.512808699 0.379108485 0.588476976 - 256 0.061686243 0.220823981 0.52004438 - 270 0 0.243254729 0.633090858 - '), + DOY,Leaf_Mg_per_ha,Stem_Mg_per_ha,Rep_Mg_per_ha,Seed_Mg_per_ha,Litter_Mg_per_ha,CumLitter_Mg_per_ha + 172,0.0328965891267146,0.0143181358308921,0,0,0,0 + 186,0.146798299121618,0.19883006141002,0,0,0,0 + 200,0.338074287830052,0.605286252689843,0,0,0,0 + 214,0.152175913412888,0.559874052059856,0.16427519712551,0,0.0496298580745502,0.0637084579721366 + 228,0.119077588733867,0.30674464428393,0.434148073671929,0,0.0171152969203539,0.056246865537668 + 242,0.512808698911872,0.379108485359897,0.588476975732098,0.341714781741362,0.0100661134816274,0.073342888722574 + 256,0.0616862433382638,0.220823980737487,0.520044379820051,0.398956753155868,0.179630286326179,0.214176625850131 + 270,0,0.243254729045758,0.633090857876002,0.507224090206239,0.0520489826201903,0.246267463689126'), header = TRUE ) ) diff --git a/man/objective_function.Rd b/man/objective_function.Rd index 5d93d39..93c6dfe 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -505,11 +505,19 @@ if (require(BioCro)) { # in two data tables; it also includes the standard deviations of the measured # biomasses, which are included in two separate tables. However, these data # tables each have a `DOY` column rather than a `time` column, so we need to - # alter them. Here we will do this by defining a short helper function. - convert_time <- function(x) { + # alter them. The tables also include other columns we do not wish to use in + # this example. So, we will define a short helper function that can be used to + # pre-process each table. + process_table <- function(x) { within(x, { - time = (DOY - 1) * 24.0 # Define new `time` column - DOY = NULL # Remove unneeded `DOY` column + # Define new `time` column + time = (DOY - 1) * 24.0 + + # Remove unneeded columns + DOY = NULL + Seed_Mg_per_ha = NULL + Litter_Mg_per_ha = NULL + CumLitter_Mg_per_ha = NULL }) } @@ -518,14 +526,14 @@ if (require(BioCro)) { # heavily as the 2002 data. data_driver_pairs <- list( ambient_2002 = list( - data = convert_time(soyface_biomass[['ambient_2002']]), - data_stdev = convert_time(soyface_biomass[['ambient_2002_std']]), + data = process_table(soyface_biomass[['ambient_2002']]), + data_stdev = process_table(soyface_biomass[['ambient_2002_std']]), drivers = BioCro::soybean_weather[['2002']], weight = 1 ), ambient_2005 = list( - data = convert_time(soyface_biomass[['ambient_2005']]), - data_stdev = convert_time(soyface_biomass[['ambient_2005_std']]), + data = process_table(soyface_biomass[['ambient_2005']]), + data_stdev = process_table(soyface_biomass[['ambient_2005_std']]), drivers = BioCro::soybean_weather[['2005']], weight = 2 ) @@ -538,7 +546,7 @@ if (require(BioCro)) { data_definitions <- list( Leaf_Mg_per_ha = 'Leaf', Stem_Mg_per_ha = 'Stem', - Pod_Mg_per_ha = 'Pod' + Rep_Mg_per_ha = 'Pod' ) # The data contains values of pod mass, but the model does not calculate pod diff --git a/man/soyface_biomass.Rd b/man/soyface_biomass.Rd index adf073e..e8a0dd7 100644 --- a/man/soyface_biomass.Rd +++ b/man/soyface_biomass.Rd @@ -20,7 +20,12 @@ \item \code{DOY}: The day of year \item \code{Leaf_Mg_per_ha}: Leaf biomass per area expressed in Mg / ha \item \code{Stem_Mg_per_ha}: Stem biomass per area expressed in Mg / ha - \item \code{Pod_Mg_per_ha}: Pod biomass per area expressed in Mg / ha + \item \code{Rep_Mg_per_ha}: Pod biomass per area expressed in Mg / ha + \item \code{Seed_Mg_per_ha}: Seed biomass per area expressed in Mg / ha + \item \code{Litter_Mg_per_ha}: Mass of leaf litter accumulated between + harvests, expressed in Mg / ha + \item \code{CumLitter_Mg_per_ha}: Cumulative leaf litter biomass per aear + expressed in Mg / ha } The elements named \code{ambient_2002} and \code{ambient_2005} represent mean @@ -33,7 +38,7 @@ } \source{ - This data was obtained from several files available from the + The leaf, stem, and pod data was obtained from several files in from the \href{https://github.com/cropsinsilico/soybean-biocro}{Soybean-BioCro GitHub repository}: \itemize{ \item \code{Data/SoyFACE_data/2002_ambient_biomass.csv} @@ -43,6 +48,16 @@ } See that repository for more information. + + The leaf litter accumulated between harvests was obtained from the original + sources. The cumulative leaf litter was calculated from the amount accumulated + between harvests. The seed mass was estimated as a fraction of the total pod + mass, using proportionality factors determined from unpublished data collected + in Champaign, Illinois during 2021-2022. + + These data tables have not been published previously, but were used to + parameterize Soybean-BioCro as used in He \emph{et al.} 2024 + (\doi{10.1093/insilicoplants/diae009}). } \keyword{datasets} diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index 29202bb..d0fe2c3 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -2,22 +2,25 @@ model <- BioCro::soybean model$ode_solver <- BioCro::default_ode_solvers[['homemade_euler']] -convert_time <- function(x) { +process_table <- function(x) { within(x, { - time = (DOY - 1) * 24.0 - DOY = NULL + time = (DOY - 1) * 24.0 + DOY = NULL + Seed_Mg_per_ha = NULL + Litter_Mg_per_ha = NULL + CumLitter_Mg_per_ha = NULL }) } ddps <- list( ambient_2002 = list( - data = convert_time(soyface_biomass[['ambient_2002']]), - data_stdev = convert_time(soyface_biomass[['ambient_2002_std']]), + data = process_table(soyface_biomass[['ambient_2002']]), + data_stdev = process_table(soyface_biomass[['ambient_2002_std']]), drivers = BioCro::soybean_weather[['2002']], weight = 1 ), ambient_2005 = list( - data = convert_time(soyface_biomass[['ambient_2005']]), + data = process_table(soyface_biomass[['ambient_2005']]), drivers = BioCro::soybean_weather[['2005']], weight = 2 ) @@ -30,7 +33,7 @@ independent_args <- with(BioCro::soybean[['parameters']], { data_definitions <- list( Leaf_Mg_per_ha = 'Leaf', Stem_Mg_per_ha = 'Stem', - Pod_Mg_per_ha = 'Pod' + Rep_Mg_per_ha = 'Pod' ) dependent_arg_function <- function(x) { @@ -415,7 +418,7 @@ test_that('Bad data values and weights are detected', { objective_function( model, within(ddps, { - ambient_2005$data_stdev = convert_time(soyface_biomass[['ambient_2005_std']]) + ambient_2005$data_stdev = process_table(soyface_biomass[['ambient_2005_std']]) ambient_2005$data_stdev[['Leaf_Mg_per_ha']] <- -0.1 }), independent_args, From 1ae0e1ca11809b2be04ce031527fd76754ca5882 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Mon, 12 May 2025 22:42:15 -0500 Subject: [PATCH 02/13] Start adding vignette --- .Rbuildignore | 3 + DESCRIPTION | 6 + vignettes/.gitignore | 2 + vignettes/parameterizing_soybean_biocro.Rmd | 204 ++++++++++++++++++++ vignettes/references.bib | 46 +++++ 5 files changed, 261 insertions(+) create mode 100644 vignettes/.gitignore create mode 100644 vignettes/parameterizing_soybean_biocro.Rmd create mode 100644 vignettes/references.bib diff --git a/.Rbuildignore b/.Rbuildignore index 3c48ecc..69c5505 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -4,6 +4,9 @@ # reason, there should not be any blank comment lines, because they would # exclude any files with `#` in the filename, possibly causing confusing issues. +^Rhistory$ +# History files + ^.*\.Rproj$ # Designates the directory as an RStudio Project. diff --git a/DESCRIPTION b/DESCRIPTION index 3a85ad4..72f0f01 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -16,6 +16,12 @@ Depends: Imports: BioCro (>= 3.2.0) Suggests: + knitr, + rmarkdown, testthat (>= 3.0.0) +VignetteBuilder: knitr +License: MIT + file LICENSE +Encoding: UTF-8 +LazyData: true URL: https://github.com/BioCro/BioCroField, https://biocro.github.io/BioCroValidation/ Config/testthat/edition: 3 diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 0000000..097b241 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/parameterizing_soybean_biocro.Rmd b/vignettes/parameterizing_soybean_biocro.Rmd new file mode 100644 index 0000000..b746a29 --- /dev/null +++ b/vignettes/parameterizing_soybean_biocro.Rmd @@ -0,0 +1,204 @@ +--- +title: "Parameterizing Soybean-BioCro" +output: + rmarkdown::html_vignette: + toc: true + number_sections: true +vignette: > + %\VignetteIndexEntry{Parameterizing Soybean-BioCro} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +bibliography: references.bib +link-citations: yes +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.width = 7.5, + fig.height = 5, + fig.align = "center" +) +``` + +# Overview + +This article shows how to create an objective function that can be used to +parameterize BioCro's soybean model +[@matthews_soybean_biocro_2021; @lochocki_biocro_2022]. + +Since the original publication of Soybean-BioCro, the BioCro module library has +undergone several changes, and the model has been re-parameterized several +times. These parameterizations have not used `BioCroValidation`, since they were +performed before `BioCroValidation` was created. + +Here, we re-create the objective function that was used for the parameterization +included in version `3.2.0` of the BioCro R package. + +In the commands below, we will use functions from several libraries, so we will +load them now: + +```{r libraries} +# Load required libraries +library(BioCroValidation) +library(BioCro) +``` + +# Building the Objective Function + +In this section, we will use the `objective_function` function from +`BioCroValidation` package to create an objective function that can be used to +parameterize Soybean-BioCro. + +## The Observed Data + +The observed data needed to parameterize Soybean-BioCro is included in the +`BioCroValidation` package as the `soyface_biomass` data set, which consists of +two years (2002 and 2005) of biomass data and associated standard deviations, +included in four separate tables. However, each table requires some +pre-processing to get it ready. + +One issue is that the data set specifies the doy of year (DOY) for each harvest, +but we need to specify the time using BioCro's convention (the number of hours +since the start of the year). This can be addressed by defining a helper +function that adds a new `time` column following BioCro's convention: + +```{r convert_time} +# Define a helping function for adding a `time` column +convert_time <- function(data_table) { + within(data_table, { + # Define new `time` column + time = (DOY - 1) * 24.0 + }) +} +``` + +Another issue is that the data set includes pod and seed values, but +Soybean-BioCro calculates shell and seed masses, where the shell and seed +together comprise the pod. This can also be addressed with helper functions, +which will be different for biomass values and standard deviations: + +```{r get_shell} +# Define a helping function for calculating shell biomass +add_shell_biomass <- function(data_table) { + within(data_table, { + Shell_Mg_per_ha = Rep_Mg_per_ha - Seed_Mg_per_ha + }) +} + +# Define a helping function for calculating shell biomass standard deviation +add_shell_stdev <- function(data_table) { + within(data_table, { + Shell_Mg_per_ha = sqrt(Rep_Mg_per_ha^2 + Seed_Mg_per_ha^2) + }) +} +``` + +Finally, the data set includes some values that are not needed for the +parameterization. This includes the leaf litter accumulated between each +harvest, as well as the `DOY` and `Rep_Mg_per_ha` columns that have been +superseded by other columns defined above. These can be removed with a final +helper function: + +```{r remove_columns} +# Define a helping function for removing unneeded columns +remove_extra_columns <- function(data_table) { + within(data_table, { + DOY = NULL + Rep_Mg_per_ha = NULL + Litter_Mg_per_ha = NULL + }) +} +``` + +Now we can apply these to each table in the set: + +```{r process_tables} +# Process the data sets +ambient_2002_biomass <- remove_extra_columns( + add_shell_biomass( + convert_time( + soyface_biomass[['ambient_2002']] + ) + ) +) + + +ambient_2005_biomass <- remove_extra_columns( + add_shell_biomass( + convert_time( + soyface_biomass[['ambient_2005']] + ) + ) +) + +ambient_2002_stdev <- remove_extra_columns( + add_shell_stdev( + convert_time( + soyface_biomass[['ambient_2002_std']] + ) + ) +) + + +ambient_2005_stdev <- remove_extra_columns( + add_shell_stdev( + convert_time( + soyface_biomass[['ambient_2005_std']] + ) + ) +) +``` + +## The Data-Driver Pairs + +The `BioCro` R package includes weather data for the years in the +`soyface_biomass` data set. So now we are ready to define the data-driver pairs, +which includes the weather, the observed biomass, the standard deviation of the +observed biomass, and the weight to assign to each year: + +```{r data_driver_pairs} +# Define the data-driver pairs +data_driver_pairs <- list( + ambient_2002 = list( + data = ambient_2002_biomass, + data_stdev = ambient_2002_stdev, + drivers = BioCro::soybean_weather[['2002']], + weight = 1 + ), + ambient_2005 = list( + data = ambient_2005_biomass, + data_stdev = ambient_2005_stdev, + drivers = BioCro::soybean_weather[['2005']], + weight = 1 + ) +) +``` + +Here we have chosen equal weights for the two years. + +# Commands From This Document + +```{r, eval = FALSE} +<> + +### +### Define helping functions +### + +<<>> + +<> + +<> + +### +### Prepare inputs for `objective_function` +### + +<> + +``` + +# References diff --git a/vignettes/references.bib b/vignettes/references.bib new file mode 100644 index 0000000..fec18e3 --- /dev/null +++ b/vignettes/references.bib @@ -0,0 +1,46 @@ + +@article{lochocki_biocro_2022, + author = {Lochocki, Edward B and Rohde, Scott and Jaiswal, Deepak and Matthews, Megan L and Miguez, Fernando and Long, Stephen P and McGrath, Justin M}, + title = {BioCro II: a software package for modular crop growth simulations}, + journal = {in silico Plants}, + volume = {4}, + number = {1}, + pages = {diac003}, + year = {2022}, + month = {02}, + doi = {10.1093/insilicoplants/diac003}, +} + +@article{matthews_soybean_biocro_2021, + author = {Matthews, Megan L and Marshall-Colón, Amy and McGrath, Justin M and Lochocki, Edward B and Long, Stephen P}, + title = {Soybean-BioCro: a semi-mechanistic model of soybean growth}, + journal = {in silico Plants}, + volume = {4}, + number = {1}, + pages = {diab032}, + year = {2021}, + month = {12}, + doi = {10.1093/insilicoplants/diab032}, +} + +@article{he_seasonal_2023, + title = {Seasonal climate conditions impact the effectiveness of improving photosynthesis to increase soybean yield}, + journal = {Field Crops Research}, + volume = {296}, + pages = {108907}, + year = {2023}, + doi = {https://doi.org/10.1016/j.fcr.2023.108907}, + author = {Yufeng He and Megan L. Matthews}, +} + +@article{he_connecting_2024, + author = {He, Yufeng and Wang, Yu and Friedel, Douglas and Lang, Meagan and Matthews, Megan L}, + title = {Connecting detailed photosynthetic kinetics to crop growth and yield: a coupled modelling framework}, + journal = {in silico Plants}, + volume = {6}, + number = {2}, + pages = {diae009}, + year = {2024}, + month = {06}, + doi = {10.1093/insilicoplants/diae009}, +} From 4671308357fcf0c3b3398488dc98a3ecdcc0d04d Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Tue, 20 May 2025 17:44:02 -0500 Subject: [PATCH 03/13] Add more verbose printing --- R/objective_function.R | 3 ++- R/objective_function_input_checks.R | 19 ++++++++++++++++--- man/objective_function.Rd | 8 +------- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/R/objective_function.R b/R/objective_function.R index d5dd158..1730e29 100644 --- a/R/objective_function.R +++ b/R/objective_function.R @@ -23,7 +23,8 @@ objective_function <- function( check_args_to_vary( independent_args, dependent_arg_function, - data_driver_pairs + data_driver_pairs, + verbose_startup ) # Get the model runners diff --git a/R/objective_function_input_checks.R b/R/objective_function_input_checks.R index 8df1f63..dd37fc9 100644 --- a/R/objective_function_input_checks.R +++ b/R/objective_function_input_checks.R @@ -148,7 +148,8 @@ check_data_driver_pairs <- function(base_model_definition, data_driver_pairs) { check_args_to_vary <- function( independent_args, dependent_arg_function, - data_driver_pairs + data_driver_pairs, + verbose ) { # Make sure the independent arguments have names @@ -168,6 +169,15 @@ check_args_to_vary <- function( } } + # Print argument names, if necessary + if (verbose) { + cat('\nThe independent arguments and their initial values:\n\n') + utils::str(independent_args) + + cat('\nThe dependent arguments and their initial values:\n\n') + utils::str(dependent_arg_function(independent_args)) + } + # Make sure no drivers were specified arg_names <- get_full_arg_names(independent_args, dependent_arg_function) @@ -402,12 +412,15 @@ check_obj_fun <- function(obj_fun, initial_ind_arg_values, verbose) { initial_error_terms <- obj_fun(as.numeric(initial_ind_arg_values), return_terms = TRUE) + initial_error <- sum(unlist(initial_error_terms)) + if (verbose) { cat('\nThe initial error metric terms:\n\n') utils::str(initial_error_terms) - } - initial_error <- sum(unlist(initial_error_terms)) + cat('\nThe initial error metric value:\n\n') + print(initial_error) + } if (!is.finite(initial_error)) { stop( diff --git a/man/objective_function.Rd b/man/objective_function.Rd index 93c6dfe..6eac3d5 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -607,13 +607,7 @@ if (require(BioCro)) { # This function could now be passed to an optimizer; here we will simply # evaluate it for two sets of parameter values. - # First try the initial values. - cat('\nError metric calculated from initial argument values:\n') - print( - obj_fun(as.numeric(independent_args)) - ) - - # Now try doubling each parameter value; in this case, the value of the + # Try doubling each parameter value; in this case, the value of the # objective function increases, indicating a lower degree of agreement between # the model and the observed data. cat('\nError metric calculated by doubling the initial argument values:\n') From 3dd88824f0f0476a233076ce7b90e9ea56b9a2f1 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Wed, 21 May 2025 16:10:52 -0500 Subject: [PATCH 04/13] Add more detailed terms to output --- R/objective_function_helpers.R | 8 +++++++- man/objective_function.Rd | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R index 13770ec..3805ce8 100644 --- a/R/objective_function_helpers.R +++ b/R/objective_function_helpers.R @@ -348,8 +348,14 @@ error_from_res <- function( # Return the sum of the penalty and error terms, or the individual errors if (return_terms) { + error_terms_by_quantity <- as.list(tapply( + errors, + long_form_data_table[['quantity_name']], + sum + )) + list( - least_squares_term = error_sum, + least_squares_terms = error_terms_by_quantity, extra_penalty = penalty ) } else { diff --git a/man/objective_function.Rd b/man/objective_function.Rd index 6eac3d5..b606011 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -610,13 +610,13 @@ if (require(BioCro)) { # Try doubling each parameter value; in this case, the value of the # objective function increases, indicating a lower degree of agreement between # the model and the observed data. - cat('\nError metric calculated by doubling the initial argument values:\n') + cat('\nError metric calculated by doubling the initial argument values:\n\n') print( obj_fun(2 * as.numeric(independent_args)) ) # We can also see the values of each term that makes up the error metric - cat('\nError metric terms calculated by doubling the initial argument values:\n') + cat('\nError metric terms calculated by doubling the initial argument values:\n\n') str( obj_fun(2 * as.numeric(independent_args), return_terms = TRUE) ) From feeb07c1f849831e819c76fa36952b6b1a983cf9 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Wed, 21 May 2025 16:11:22 -0500 Subject: [PATCH 05/13] Finish calculating objective function in vignette --- vignettes/parameterizing_soybean_biocro.Rmd | 402 ++++++++++++++++++-- vignettes/references.bib | 11 + 2 files changed, 372 insertions(+), 41 deletions(-) diff --git a/vignettes/parameterizing_soybean_biocro.Rmd b/vignettes/parameterizing_soybean_biocro.Rmd index b746a29..a0abc0d 100644 --- a/vignettes/parameterizing_soybean_biocro.Rmd +++ b/vignettes/parameterizing_soybean_biocro.Rmd @@ -30,11 +30,13 @@ parameterize BioCro's soybean model Since the original publication of Soybean-BioCro, the BioCro module library has undergone several changes, and the model has been re-parameterized several -times. These parameterizations have not used `BioCroValidation`, since they were +times. These parameterizations did not use `BioCroValidation`, since they were performed before `BioCroValidation` was created. -Here, we re-create the objective function that was used for the parameterization -included in version `3.2.0` of the BioCro R package. +However, `BioCroValidation` is able to re-create the objective functions that +were used for these parameterizations. Here, we re-create the objective function +that was used for the parameterization included in version `3.2.0` of the BioCro +R package. In the commands below, we will use functions from several libraries, so we will load them now: @@ -43,13 +45,32 @@ load them now: # Load required libraries library(BioCroValidation) library(BioCro) +library(lattice) ``` # Building the Objective Function In this section, we will use the `objective_function` function from `BioCroValidation` package to create an objective function that can be used to -parameterize Soybean-BioCro. +parameterize Soybean-BioCro. For more details about this, please see the help +page for `objective_function` by typing `?objective_function` from an R +terminal. + +## The Base Model Definition + +We first need a base model definition that includes the necessary modules, +initial values, parameters, and differential equation solver specifications. For +this example, we will simply use Soybean-BioCro as the base model, with just one +small change: we will use an Euler solver rather than the default solver, which +will help make the optimization run faster. For reasonable sets of parameter +values, the Euler solver does not seem to cause any substantial errors when +running Soybean-BioCro. + +```{r base_model_definition} +# Specify the base model definition +base_model_definition <- soybean +base_model_definition$ode_solver <- default_ode_solvers[['homemade_euler']] +``` ## The Observed Data @@ -83,6 +104,7 @@ which will be different for biomass values and standard deviations: # Define a helping function for calculating shell biomass add_shell_biomass <- function(data_table) { within(data_table, { + # The shell is all parts of the pod other than the seed Shell_Mg_per_ha = Rep_Mg_per_ha - Seed_Mg_per_ha }) } @@ -90,11 +112,63 @@ add_shell_biomass <- function(data_table) { # Define a helping function for calculating shell biomass standard deviation add_shell_stdev <- function(data_table) { within(data_table, { + # Add uncertainties in quadrature, a simple approach to error propagation Shell_Mg_per_ha = sqrt(Rep_Mg_per_ha^2 + Seed_Mg_per_ha^2) }) } ``` +Although the observations do not include root biomass, it is nevertheless +important to constrain the predicted root mass to reasonable values. To do this, +it is assumed that the maximum root mass is seventeen percent of the maximum +aboveground biomass, and that it is achieved at the same time as maximum +above-ground biomass, based on observations reported in @ordonez_root_2020. In +the observed data, the sum of stem and leaf mass is largest at the fifth time +point in both years. So, root mass is estimated at this single time point and +added to the observed values. + +In previous parameterizations, a standard deviation for the root mass was not +explicitly estimated; instead, the standard-deviation-based weight factor was +implicitly set to 1. Because the `'logarithm'` method was used, a weight factor +of 1 implies a standard deviation of \(1 / e - 10^{-5} \approx 0.3678694\). See +the documentation page (`?objective_function`) for more information about this +weighting method. + +The root mass and its standard deviation can be added with another set of helper +functions. Note that any observations set to `NA` will be ignored when +calculating the error metric. + +```{r get_root} +# Define a helping function for adding the root mass +add_root_biomass <- function(data_table) { + # Initialize all values to NA + data_table$Root_Mg_per_ha <- NA + + # Estimate a mass at one time point + row_to_use <- 5 + + col_to_add <- c( + 'Leaf_Mg_per_ha', + 'Stem_Mg_per_ha', + 'Rep_Mg_per_ha' + ) + + data_table[row_to_use, 'Root_Mg_per_ha'] <- + 0.17 * sum(data_table[row_to_use, col_to_add]) + + data_table +} + +# Define a helping function for adding the root standard deviation +add_root_stdev <- function(data_table) { + # We can set a value for each time point; any time points where the root mass + # is NA will be ignored + data_table$Root_Mg_per_ha <- 1 / exp(1) - 1e-5 + + data_table +} +``` + Finally, the data set includes some values that are not needed for the parameterization. This includes the leaf litter accumulated between each harvest, as well as the `DOY` and `Rep_Mg_per_ha` columns that have been @@ -105,6 +179,7 @@ helper function: # Define a helping function for removing unneeded columns remove_extra_columns <- function(data_table) { within(data_table, { + # Remove columns by setting them to NULL DOY = NULL Rep_Mg_per_ha = NULL Litter_Mg_per_ha = NULL @@ -115,40 +190,26 @@ remove_extra_columns <- function(data_table) { Now we can apply these to each table in the set: ```{r process_tables} -# Process the data sets -ambient_2002_biomass <- remove_extra_columns( - add_shell_biomass( - convert_time( - soyface_biomass[['ambient_2002']] - ) - ) -) - - -ambient_2005_biomass <- remove_extra_columns( - add_shell_biomass( - convert_time( - soyface_biomass[['ambient_2005']] - ) - ) -) - -ambient_2002_stdev <- remove_extra_columns( - add_shell_stdev( - convert_time( - soyface_biomass[['ambient_2002_std']] - ) - ) -) - - -ambient_2005_stdev <- remove_extra_columns( - add_shell_stdev( - convert_time( - soyface_biomass[['ambient_2005_std']] - ) - ) -) +# Process the data sets (biomass and stdev from 2002 and 2005) +ambient_2002_biomass <- + remove_extra_columns(add_root_biomass(add_shell_biomass(convert_time( + soyface_biomass[['ambient_2002']] + )))) + +ambient_2005_biomass <- + remove_extra_columns(add_root_biomass(add_shell_biomass(convert_time( + soyface_biomass[['ambient_2005']] + )))) + +ambient_2002_stdev <- + remove_extra_columns(add_root_stdev(add_shell_stdev(convert_time( + soyface_biomass[['ambient_2002_std']] + )))) + +ambient_2005_stdev <- + remove_extra_columns(add_root_stdev(add_shell_stdev(convert_time( + soyface_biomass[['ambient_2005_std']] + )))) ``` ## The Data-Driver Pairs @@ -178,27 +239,286 @@ data_driver_pairs <- list( Here we have chosen equal weights for the two years. +## The Post-Processing Function + +The observed data includes values of the total litter, which is comprised of +both leaf and stem litter. However, the model does not calculate this quntity; +instead, it returns separate values of leaf and stem litter. To address this +issue, we can provide a "post-processing function." This is an (optional) +function that is applied to each simulation result and can be used to add new +columns. Here we define such a function, which adds a new column for the total +litter: + +```{r post_process_function} +# Define the post-processing function +post_process_function <- function(sim_res) { + # Calculate the total litter as the sum of leaf and stem litter + within(sim_res, {TotalLitter = LeafLitter + StemLitter}) +} +``` + +## The Data Definitions + +The data sets above have columns whose names do not match the corresponding +model outputs. For example, the `Leaf_Mg_per_ha` column of the observed data +must be compared to the `Leaf` column of the model output, since both represent +the leaf mass per unit ground area. To handle this mismatch, we can provide a +set of "data definitions" that specify which columns should be compared: + +```{r data_definitions} +# Define the data definition list, where the element names are columns in the +# observed data tables, and the element values are the corresponding column +# names in the model outputs +data_definitions <- list( +# Observed Simulated + CumLitter_Mg_per_ha = 'TotalLitter', + Leaf_Mg_per_ha = 'Leaf', + Root_Mg_per_ha = 'Root', + Seed_Mg_per_ha = 'Grain', + Shell_Mg_per_ha = 'Shell', + Stem_Mg_per_ha = 'Stem' +) +``` + +## The Arguments to Vary + +Here we wish to vary several parameters related to carbon partitioning for +growth, senescence, maintenance respiration, and growth respiration: + +- For each growing tissue, there are two parameters (\(\alpha\) and \(\beta\)) + that influence the parbon partitioning coefficients. Here we will vary these + for the leaf, stem, and shell (6 parameters in total). + +- For each senescing tissue, there are three parameters (\(\alpha_{sen}\), + \(\beta_{sen}\), and `rate`) that influence when senescence begins and + the overall rate of scenescence. Here we will vary these for the leaf and stem + (6 parameters in total). + +- For each growing tissue, there is one parameter (`grc`) that influences the + rate of carbon use for growth respiration. Here we will vary these for the + stem and root (2 parameters in total). + +- For each tissue, there is one parameter (`mrc`) that influences the rate of + carbon use for maintenance respiration. Here we will vary these for the leaf, + stem, and root (3 parameters in total). + +Together, this is 17 arguments to vary. Typically, an optimization problem +requires more time for each free parameter involved, so it is helpful to vary +the smallest possible set. One way to reduce the number of free parameters is to +treat some as being "dependent." In other words, to calculate the values of some +parameters from the values of others, so that only some of them are truly free +or "independent." Here we will do this by fixing the value of `mrc_stem` to the +value of `mrc_leaf`. Thus, we can think of this is a single maintenance +respiration coefficient for the entire shoot; this reduces the number of +independent parameters by one (to 16). + +The independent arguments must be specified as a list of named numeric elements, +where the name is the argument name and the value is an initial guess for that +argument. Here we will use the default Soybean-BioCro values as our initial +guesses: + +```{r independent_args} +# Define a list of independent arguments and their initial values +independent_arg_names <- c( + # Partitioning for leaf, stem, and shell + 'alphaLeaf', + 'betaLeaf', + 'alphaStem', + 'betaStem', + 'alphaShell', + 'betaShell', + + # Senescence for leaf and stem + 'alphaSeneLeaf', + 'betaSeneLeaf', + 'rateSeneLeaf', + 'alphaSeneStem', + 'betaSeneStem', + 'rateSeneStem', + + # Growth respiration for stem and root + 'grc_stem', + 'grc_root', + + # Maintenance respiration for leaf and root + 'mrc_leaf', + 'mrc_root' +) + +independent_args <- soybean$parameters[independent_arg_names] +``` + +The dependent arguments must be specified as a function that takes a list of +independent arguments as its input, and returns a list of dependent arguments as +its output: + +```{r dependent_arg_function} +# Define a function that sets `mrc_stem` to the value of `mrc_leaf` +dependent_arg_function <- function(ind_args) { + list(mrc_stem = ind_args[['mrc_leaf']]) +} +``` + +## The Quantity Weights + +When determining the error metric value, we wish to assign different weights to +each type of observed value. This can be handled via the `quantity_weights`, +which must be a list of named numeric elements, where the name of each element +is an output from the simulation, and its value is the weight. + +```{r quantity_weights} +# Specify the quantity weights; there is no systematic way to determine these, +# but the following weights have worked well in the past for Soybean-BioCro +quantity_weights <- list( + Grain = 1.0, + Leaf = 1.0, + Root = 0.1, + Shell = 0.5, + Stem = 1.0, + TotalLitter = 0.1 +) +``` + +## The Extra Penalty Function + +Sometimes an optimizer may choose parameter values that produce close agreement +with the observed data but are nevertheless unreasonable from a biological +perspective. + +To prevent these unreasonable parameters from being chosen, "extra penalties" +can be added to the error metric. These penalties can be specified using an +`extra_penalty_function`, which must take the result from a BioCro simulation +as its input and return a numeric error penalty value, which generally should be +zero (when no issues are found) or a large positive number (if an issue has been +found). + +For Soybean-BioCro parameterization, three common issues are that: + +1. Carbon is never partitioned to one or more key tissues. + +2. Carbon partitioning to the stem and leaf starts at different times. + +3. Carbon partitioning to the leaves begins too early or too late. + +The function below will return a large value when any of these situations +occurs, and will otherwise return a value of zero. + +```{r extra_penalty_function} +# Define an extra penalty function +extra_penalty_function <- function(sim_res) { + # Set the penalty value + PENALTY <- 9999 + + # Get the first times when each partitioning coefficient becomes non-zero + k_thresh <- 0.01 # Threshold k value to decide when growth has started + hpd <- 24.0 # Hours per day + + time <- sim_res[['time']] + + time_grain <- time[sim_res[['kGrain']] > k_thresh][1] + time_leaf <- time[sim_res[['kLeaf']] > k_thresh][1] + time_shell <- time[sim_res[['kShell']] > k_thresh][1] + time_stem <- time[sim_res[['kStem']] > k_thresh][1] + + # Return a penalty if necessary + if (is.na(time_grain) | is.na(time_leaf) | is.na(time_shell) | is.na(time_stem)) { + # One or more tissues is not growing + return(PENALTY) + } else if (abs(time_leaf - time_stem) > 5 * hpd) { + # The starts of leaf and stem growth are more than 5 days apart + return(PENALTY) + } else if (time_leaf - time[1] > 20 * hpd | time_leaf - time[1] < 10 * hpd) { + # The start of leaf growth is too late (more than 20 days after sowing) or + # too early (fewer than 10 days after sowing) + return(PENALTY) + } else { + # No problems were detected + return(0.0) + } +} +``` + +## The Objective Function + +Now we are just about ready to build the objective function. There are a few +more details to discuss: + +- Soybean-BioCro has always used the `'mean_max'` method for determining + normalization factors; see Equations 14-16 of @matthews_soybean_biocro_2021 + for more details. + +- Soybean-BioCro has always used the `'logarithm'` method for determining + weights from standard deviations; see Equation 17 of + @matthews_soybean_biocro_2021 for more details. + +- Soybean-BioCro has not used any regularization. + +With this, it is possible to build the function. Note that some useful +information is printed out when the function is created, such as the full list +of observed values and their corresponding weights. + +```{r objective_function} +# Create the objective function +obj_fun <- objective_function( + base_model_definition, + data_driver_pairs, + independent_args, + quantity_weights, + data_definitions = data_definitions, + normalization_method = 'mean_max', + stdev_weight_method = 'logarithm', + regularization_method = 'none', + dependent_arg_function = dependent_arg_function, + post_process_function = post_process_function, + extra_penalty_function = extra_penalty_function +) +``` + # Commands From This Document ```{r, eval = FALSE} +### +### Preliminaries +### + <> ### ### Define helping functions ### -<<>> +<> <> -<> +<> + +<> ### -### Prepare inputs for `objective_function` +### Prepare inputs for `objective_function` and call it ### +<> + +<> + <> +<> + +<> + +<> + +<> + +<> + +<> + +<> + ``` # References diff --git a/vignettes/references.bib b/vignettes/references.bib index fec18e3..4534ebb 100644 --- a/vignettes/references.bib +++ b/vignettes/references.bib @@ -44,3 +44,14 @@ @article{he_connecting_2024 month = {06}, doi = {10.1093/insilicoplants/diae009}, } + +@article{ordonez_root_2020, + title = {Root to shoot and carbon to nitrogen ratios of maize and soybean crops in the {US} {Midwest}}, + volume = {120}, + doi = {10.1016/j.eja.2020.126130}, + journal = {European Journal of Agronomy}, + author = {Ordóñez, Raziel A. and Archontoulis, Sotirios V. and Martinez-Feria, Rafael and Hatfield, Jerry L. and Wright, Emily E. and Castellano, Michael J.}, + month = oct, + year = {2020}, + pages = {126130}, +} From efce402a206a3fb4c853f8a2e48cd517716a38a0 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Thu, 22 May 2025 10:46:25 -0500 Subject: [PATCH 06/13] Add more error checking Make sure we can properly handle when a predicted value is NA, and that an error is thrown when an input argument value is NA. --- R/objective_function_helpers.R | 21 +++++++--- tests/testthat/test-objective_function.R | 52 ++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 5 deletions(-) diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R index 3805ce8..c2d1829 100644 --- a/R/objective_function_helpers.R +++ b/R/objective_function_helpers.R @@ -62,6 +62,13 @@ get_model_runner <- function( c(x, as.numeric(dependent_arg_function(x_for_dep_arg_func))) } + if (any(!is.finite(x_for_partial))) { + stop( + 'At least one independent or dependent argument ', + 'value is not finite' + ) + } + initial_res <- partial_func(x_for_partial) if (is.null(post_process_function)) { @@ -272,13 +279,17 @@ one_error <- function( normalization ) { - qw <- if (predicted < observed) { - quantity_weight[1] # Underprediction + if (!is.finite(predicted)) { + NA } else { - quantity_weight[2] # Overprediction - } + qw <- if (predicted < observed) { + quantity_weight[1] # Underprediction + } else { + quantity_weight[2] # Overprediction + } - (observed - predicted)^2 * qw * ddp_weight * var_weight / normalization + (observed - predicted)^2 * qw * ddp_weight * var_weight / normalization + } } # Helping function for returning a failure value diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index d0fe2c3..91bdf60 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -382,6 +382,7 @@ test_that('Bad variance methods are detected', { }) test_that('Bad return values are detected', { + # A penalty evaluates to NA expect_error( objective_function( model, @@ -395,6 +396,20 @@ test_that('Bad return values are detected', { ), 'The objective function did not return a finite value when using the initial argument values; instead, it returned: NA' ) + + # A predicted value is NA + expect_error( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = function(x) {within(x, {Pod = NA})}, + verbose_startup = verbose_startup + ), + 'The objective function did not return a finite value when using the initial argument values; instead, it returned: NA' + ) }) test_that('Bad regularization methods are detected', { @@ -437,3 +452,40 @@ test_that('Bad data values and weights are detected', { fixed = TRUE ) }) + +test_that('NA argument values and predicted values are handled', { + # An independent argument value is NA + expect_error( + objective_function( + model, + ddps, + within(independent_args, {alphaLeaf = NA}), + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ), + 'The model could not be run with the following drivers: +ambient_2002: Error in runner(as.numeric(independent_args)): At least one independent or dependent argument value is not finite +ambient_2005: Error in runner(as.numeric(independent_args)): At least one independent or dependent argument value is not finite', + fixed = TRUE + ) + + # A dependent argument value is NA + expect_error( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + data_definitions = data_definitions, + dependent_arg_function = function(x) {list(alphaStem = NA)}, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ), + 'The model could not be run with the following drivers: +ambient_2002: Error in runner(as.numeric(independent_args)): At least one independent or dependent argument value is not finite +ambient_2005: Error in runner(as.numeric(independent_args)): At least one independent or dependent argument value is not finite', + fixed = TRUE + ) +}) From d50a43125f86050b66488e9a0ae3276e08f1e991 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Thu, 22 May 2025 11:34:54 -0500 Subject: [PATCH 07/13] Add debug mode option --- R/objective_function_helpers.R | 38 +++++++++++++++++++++++++++++++--- man/objective_function.Rd | 32 +++++++++++++++++----------- 2 files changed, 55 insertions(+), 15 deletions(-) diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R index c2d1829..ce32bef 100644 --- a/R/objective_function_helpers.R +++ b/R/objective_function_helpers.R @@ -403,7 +403,18 @@ get_obj_fun <- function( regularization_method ) { - function(x, lambda = 0, return_terms = FALSE) { + function(x, lambda = 0, return_terms = FALSE, debug_mode = FALSE) { + if (debug_mode) { + msg <- paste0( + '\nTime: ', + Sys.time(), + ' Independent argument values: ', + paste(x, collapse = ', '), + '\n' + ) + cat(msg) + } + errors <- lapply(seq_along(model_runners), function(i) { runner <- model_runners[[i]] res <- runner(x) @@ -422,15 +433,36 @@ get_obj_fun <- function( reg_penalty <- regularization_penalty(x, regularization_method, lambda) if (return_terms) { - list( + error_metric_terms <- list( terms_from_data_driver_pairs = stats::setNames( errors, names(model_runners) ), regularization_penalty = reg_penalty ) + + if (debug_mode) { + cat(paste0('Time: ', Sys.time()), ' Error metric terms: ') + utils::str(error_metric_terms) + cat('\n') + } + + error_metric_terms } else { - sum(as.numeric(errors)) + reg_penalty + error_metric <- sum(as.numeric(errors)) + reg_penalty + + if (debug_mode) { + msg <- paste0( + 'Time: ', + Sys.time(), + ' Error metric: ', + error_metric, + '\n' + ) + cat(msg) + } + + error_metric } } } diff --git a/man/objective_function.Rd b/man/objective_function.Rd index b606011..4693d9c 100644 --- a/man/objective_function.Rd +++ b/man/objective_function.Rd @@ -468,7 +468,7 @@ \value{ A function \code{obj_fun} with signature - \code{obj_fun(x, lambda = 0, return_terms = FALSE)}. + \code{obj_fun(x, lambda = 0, return_terms = FALSE, debug_mode = FALSE)}. Here, \code{x} is a numeric vector of values of the independent arguments (in the same order as in \code{independent_arg_names}), and \code{lambda} is the @@ -484,6 +484,15 @@ Setting it to \code{TRUE} can be useful for troubleshooting, or for diagnostics such as checking the quality of fit for each of the data-driver pairs. + + The \code{debug_mode} argument determines whether \code{obj_fun} is running in + debug mode. In debug mode, each time \code{obj_fun} is called, it will print + the values of \code{x} and the error metric to the R terminal. This can be + useful when troubleshooting a problem with an optimization, since it provides + a record of any problematic parameter combinations. When setting + \code{debug_mode} to \code{TRUE}, also consider using \code{\link[base]{sink}} + to write the outputs to a file instead of the R terminal. In that case, there + will still be a record even if R crashes. } \examples{ @@ -609,16 +618,15 @@ if (require(BioCro)) { # Try doubling each parameter value; in this case, the value of the # objective function increases, indicating a lower degree of agreement between - # the model and the observed data. - cat('\nError metric calculated by doubling the initial argument values:\n\n') - print( - obj_fun(2 * as.numeric(independent_args)) - ) - - # We can also see the values of each term that makes up the error metric - cat('\nError metric terms calculated by doubling the initial argument values:\n\n') - str( - obj_fun(2 * as.numeric(independent_args), return_terms = TRUE) - ) + # the model and the observed data. Here we will call `obj_fun` in debug mode, + # which will automatically print the value of the error metric. + cat('\nError metric calculated by doubling the initial argument values:\n') + error_metric <- obj_fun(2 * as.numeric(independent_args), debug_mode = TRUE) + + # We can also see the values of each term that makes up the error metric; + # again, we will call `obj_fun` in debug mode for automatic printing. + cat('\nError metric terms calculated by doubling the initial argument values:\n') + error_terms <- + obj_fun(2 * as.numeric(independent_args), return_terms = TRUE, debug_mode = TRUE) } } From 8e6374a031c7f0abdabab38d6efebefeef3d37f7 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Thu, 22 May 2025 11:49:42 -0500 Subject: [PATCH 08/13] Add optimization example to vignette --- DESCRIPTION | 1 + vignettes/parameterizing_soybean_biocro.Rmd | 231 +++++++++++++++++++- 2 files changed, 230 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 72f0f01..607df58 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -16,6 +16,7 @@ Depends: Imports: BioCro (>= 3.2.0) Suggests: + dfoptim, knitr, rmarkdown, testthat (>= 3.0.0) diff --git a/vignettes/parameterizing_soybean_biocro.Rmd b/vignettes/parameterizing_soybean_biocro.Rmd index a0abc0d..9f5d540 100644 --- a/vignettes/parameterizing_soybean_biocro.Rmd +++ b/vignettes/parameterizing_soybean_biocro.Rmd @@ -45,7 +45,7 @@ load them now: # Load required libraries library(BioCroValidation) library(BioCro) -library(lattice) +library(dfoptim) ``` # Building the Objective Function @@ -474,6 +474,217 @@ obj_fun <- objective_function( ) ``` +# Optimizing the Parameter Values + +The objective function is designed to be passed to a minimization algorithm, +which will determine the argument values that produce the best agreement between +the model predictions and the observations. + +Soybean-BioCro has already been parameterized, so there is already good +agreement between the model and the data. This can be seen by examining the +value of the error metric when using the default Soybean-BioCro values: + +```{r} +# Evaluate the error function with default Soybean-BioCro parameters +default_error <- obj_fun(as.numeric(independent_args)) +``` + +This evaluates to `r default_error`. This is a low value for a Soybean-BioCro +parameterization, indicating that good agreement has already been found. + +Here, as an example, we will intentionally change each parameter value by a +small random amount, and then use an optimizer to improve the parameter values; +in an ideal world, the optimizer will eventually pick parameter values close to +the original Soybean-BioCro values. + +There are many optimizers available in R. Base R includes the `optim` function, +and others are available from the `dfoptim` and `DEoptim` packages. Here we will +use the `nmkb` optimizer from the `dfoptim` library, which requires upper and +lower bounds for each parameter and an initial guess. + +## Choosing an Initial Guess + +As mentioned above, we will intentionally choose a "bad" initial guess by +tweaking each parameter value by a small random amount. Note that we set a seed +to ensure the same result is obtained every time this is performed. Also note +that the initial guess must be a numeric vector, where the elements are ordered +as they are in `independent_args`. + +```{r initial_guess} +# Set a seed +set.seed(1234) + +# Make an initial guess by perturbing the default values by a small amount +rel_size <- 0.02 + +initial_guess <- as.numeric(independent_args) * + (1.0 + runif(length(independent_args), -rel_size, rel_size)) +``` + +Even though the changes to parameter values are small, there is still a +substantial change in the value of the error metric: + +```{r} +# Evaluate the error function with default Soybean-BioCro parameters +initial_error <- obj_fun(initial_guess) +``` + +This evaluates to `r initial_error`, which is about +`r round(100 * (initial_error - default_error) / initial_error)` percent larger +than with the default parameter values. + +## Choosing Lower and Upper Bounds + +There is not always a systematic approach to choosing lower and upper bounds for +parameter values, but the following bounds have worked well for Soybean-BioCro +in the past: + +- The \(\alpha\) parameters used in partitioning and senescence calculations are + confined to the interval [0, 50]. + +- The \(\beta\) parameters used in partitioning and senescence calculations are + confined to the interval [-50, 0]. + +- The senescence rates have a lower bound of zero, but have different upper + bounds for each tissue. + +- The maintenance respiration coefficients are confined to the interval + [1e-6, 1e-2]. + +- The growth respiration coefficients must be positive and non-zero, but have + different bounds for each tissue. + +There are many possible ways to specify the bounds in R, but ultimately they +must be expressed as numeric vectors, where the elements are ordered as they are +in `independent_args`. Here we supply lower and upper bounds for each parameter +in a data frame and then ensure they are properly ordered. Later, the data frame +columns can be passed to the optimizer as the bounds: + +```{r setting_bounds} +# Define a table with the bounds +bounds <- read.table( + textConnection(' + arg_name lower upper + alphaLeaf +0 +50 + alphaStem +0 +50 + alphaShell +0 +50 + alphaSeneLeaf +0 +50 + alphaSeneStem +0 +50 + betaLeaf -50 +0 + betaStem -50 +0 + betaShell -50 +0 + betaSeneLeaf -50 +0 + betaSeneStem -50 +0 + rateSeneLeaf +0 +0.0125 + rateSeneStem +0 +0.005 + mrc_leaf +1e-6 +1e-2 + mrc_root +1e-6 +1e-2 + grc_stem +8e-4 +0.08 + grc_root +0.0025 +0.075 + '), + header = TRUE, + stringsAsFactors = FALSE +) + +# Make sure it is ordered properly +bounds <- bounds[match(independent_arg_names, bounds[['arg_name']]), ] +``` + +## Running the Optimizer + +Now we will use an optimizer to improve on the initial guess. As mentioned +above, we will use the `nmkb` optimizer from the `dfoptim` package. This is a +good choice when a decent starting guess is known. If a broader search is +necessary, `DEoptim` from the `DEoptim` package may be more appropriate, +although it generally needs more time to run. + +To make sure this example does not take too much time, we will use a loose +tolerance; a more realistic example would probably use `1e-4` or `1e-5`. + +```{r run_optimizer, eval = FALSE} +# Run the optimizer +optim_res <- nmkb( + initial_guess, + obj_fun, + bounds[['lower']], + bounds[['upper']], + control = list( + tol = 1e-2, + restarts.max = 10 + ), + debug_mode = FALSE # Passed to obj_fun; set to TRUE to enable debug mode +) +``` + +```{r, echo = FALSE} +timing <- system.time({ +<> +}) +``` + +When this document was generated, running the optimizer required the following +amount of time: + +```{r, echo = FALSE} +print(timing) +``` + +The total time was about `r round(timing[3] / 60, 2)` minutes. For a real +parameterization problem, it can be many times longer, and may even need days to +run on a personal laptop computer. + +The optimizer also reports how many times the objective function was called, +among other details: + +```{r} +str(optim_res) +``` + +The value of `feval` is `r optim_res[['feval']]`, so on average, each call of +the objective function required approximately +`r round(timing[3] / optim_res[['feval']], 3)` seconds. + +## Comparing Parameter Values + +Let's see whether the optimized parameters are closer to the default parameters +than the initial guess was. + +```{r compare_param} +# Create a table of the various independent argument values +ind_arg_table <- data.frame( + arg_name = independent_arg_names, + defaults = as.numeric(independent_args), + initial_guess = initial_guess, + optimized = optim_res[['par']], + stringsAsFactors = FALSE +) + +# Add differences +ind_arg_table <- within(ind_arg_table, { + initial_diff = abs(initial_guess - defaults) + optimized_diff = abs(optimized - defaults) + improved = optimized_diff < initial_diff +}) + +# View results +print(ind_arg_table) +``` + +In this table, when the `improved` column is `TRUE`, this means that the +optimized parameter value is closer to the default value than the initial guess +was; in other words, it means that the optimizer truly improved on the initial +guess. In this example, `r sum(ind_arg_table[['improved']])` out of +`r nrow(ind_arg_table)` parameter estimates were improved +(`r round(100 * sum(ind_arg_table[['improved']]) / nrow(ind_arg_table))` percent). + +We can also compare the error metric to its original value. As shown above, it +is now `r optim_res[['value']]`, which is only +`r round(100 * (optim_res[['value']] - default_error) / optim_res[['value']], 1)` +percent larger than with the default parameter values. + +The optimized parameter values could likely be improved by using a more +stringent tolerance for the optimizer, but this would require more time to run. + # Commands From This Document ```{r, eval = FALSE} @@ -496,7 +707,7 @@ obj_fun <- objective_function( <> ### -### Prepare inputs for `objective_function` and call it +### Prepare inputs for `objective_function` and create an objective function ### <> @@ -519,6 +730,22 @@ obj_fun <- objective_function( <> +### +### Use an optimizer to choose parameter values +### + +<> + +<> + +<> + +### +### Check and record the new values +### + +<> + ``` # References From a64c7cd64da317163bd398b5e4dbe2d1079de374 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Thu, 22 May 2025 16:15:43 -0500 Subject: [PATCH 09/13] Add `update_model` and finish vignette example --- DESCRIPTION | 1 + NAMESPACE | 1 + NEWS.md | 8 +- R/update_model.R | 80 ++++++++++++ man/update_model.Rd | 70 +++++++++++ tests/testthat/test-update_model.R | 83 +++++++++++++ vignettes/parameterizing_soybean_biocro.Rmd | 128 ++++++++++++++++++++ 7 files changed, 370 insertions(+), 1 deletion(-) create mode 100644 R/update_model.R create mode 100644 man/update_model.Rd create mode 100644 tests/testthat/test-update_model.R diff --git a/DESCRIPTION b/DESCRIPTION index 607df58..4dfa230 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,6 +17,7 @@ Imports: BioCro (>= 3.2.0) Suggests: dfoptim, + lattice, knitr, rmarkdown, testthat (>= 3.0.0) diff --git a/NAMESPACE b/NAMESPACE index ba5cd56..ad14bd8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,2 +1,3 @@ export(objective_function) +export(update_model) export(write_model) diff --git a/NEWS.md b/NEWS.md index 8141b4c..b13d94e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -37,7 +37,13 @@ be directly added to this file to describe the related changes. - Added 2002 and 2005 SoyFACE biomass data. -- Added the `objective_function` function. +- Added several new functions: + + - `objective_function` + + - `update_model` + +- Added a vignette illustrating how to perform a model parameterization. # Changes in BioCroValidation Version 0.1.0 diff --git a/R/update_model.R b/R/update_model.R new file mode 100644 index 0000000..1a5d32a --- /dev/null +++ b/R/update_model.R @@ -0,0 +1,80 @@ +update_model <- function( + base_model_definition, + independent_args, + new_ind_arg_values, + dependent_arg_function = NULL +) +{ + # Make sure the model definition has initial_values and parameters + req_elements <- c('initial_values', 'parameters') + if (!all(req_elements %in% names(base_model_definition))) { + stop( + 'The `base_model_definition` must have the following elements: ', + paste(req_elements, collapse = ', ') + ) + } + + # Make sure the argument lists have the same length + if (length(independent_args) != length(new_ind_arg_values)) { + stop('`independent_args` and `new_ind_arg_values` must have the same length') + } + + # Update the values of the independent arguments + new_independent_args <- stats::setNames( + as.list(new_ind_arg_values), + names(independent_args) + ) + + # Also get the values of the dependent arguments + all_args <- if (!is.null(dependent_arg_function)) { + c(new_independent_args, dependent_arg_function(new_independent_args)) + } else { + new_independent_args + } + + # Find all quantities in the initial values and parameters and store them in + # a list + iv <- as.list( + rep_len( + 'initial_values', + length(base_model_definition[['initial_values']]) + ) + ) + names(iv) <- names(base_model_definition[['initial_values']]) + + param <- as.list( + rep_len( + 'parameters', + length(base_model_definition[['parameters']]) + ) + ) + names(param) <- names(base_model_definition[['parameters']]) + + model_quantities <- c(iv, param) + + # Make sure each supplied argument is included in the model + not_in_model <- !names(all_args) %in% names(model_quantities) + + if (any(not_in_model)) { + msg <- paste0( + 'Values were supplied for the following quantities, but they ', + 'are not `initial_values` or `parameters` of ', + 'the `base_model_definition`: ', + paste(names(all_args)[not_in_model], collapse = ', ') + ) + stop(msg) + } + + # Make a copy of the model with the new argument values and return it + new_model_definition <- base_model_definition + + for (i in seq_along(all_args)) { + arg_name <- names(all_args)[i] + arg_type <- model_quantities[[arg_name]] + arg_value <- all_args[[i]] + + new_model_definition[[arg_type]][[arg_name]] <- arg_value + } + + new_model_definition +} diff --git a/man/update_model.Rd b/man/update_model.Rd new file mode 100644 index 0000000..f3a86ae --- /dev/null +++ b/man/update_model.Rd @@ -0,0 +1,70 @@ +\name{update_model} + +\alias{update_model} + +\title{Update a BioCro model definition} + +\description{ + Following an optimization, it is typically necessary to update the initial + values and/or parameters of a base model definition with new values determined + during the optimization. The \code{update_model} function helps to streamline + this process. It is expected that this function will be called after + \code{\link{objective_function}}. +} + +\usage{ + update_model( + base_model_definition, + independent_args, + new_ind_arg_values, + dependent_arg_function = NULL + ) +} + +\arguments{ + \item{base_model_definition}{ + The same value that was passed to \code{\link{objective_function}}. + } + + \item{independent_args}{ + The same value that was passed to \code{\link{objective_function}}. + } + + \item{new_ind_arg_values}{ + A numeric vector representing new values of the independent arguments, + typically determined by an optimizer. + } + + \item{dependent_arg_function}{ + The same value that was passed to \code{\link{objective_function}}. + } +} + +\value{ + A list based on \code{base_model_definition} but with new values of some of + its \code{initial_values} and \code{parameters}, as specified by the elements + of \code{independent_args} and \code{new_ind_arg_values}. +} + +\examples{ +if (require(BioCro)) { + # Update the default Soybean-BioCro model with new values of `Leaf` (an + # initial value) and `alphaStem` (a parameter) + base_model <- BioCro::soybean + + new_model <- update_model( + base_model, + list(Leaf = 1, alphaLeaf = 2), # The values here will not be used + c(1000, 2000) # These are the actual new values + ) + + # Compare the two models + cat('\n\nComparing initial Leaf values:\n') + print(base_model$initial_values$Leaf) + print(new_model$initial_values$Leaf) + + cat('\n\nComparing alphaLeaf values:\n') + print(base_model$parameters$alphaLeaf) + print(new_model$parameters$alphaLeaf) +} +} diff --git a/tests/testthat/test-update_model.R b/tests/testthat/test-update_model.R new file mode 100644 index 0000000..93d74eb --- /dev/null +++ b/tests/testthat/test-update_model.R @@ -0,0 +1,83 @@ +NEWVAL <- 10000 + +base_model_definition <- BioCro::soybean +independent_args <- list(alphaLeaf = NEWVAL, Leaf = NEWVAL) +new_ind_arg_values <- c(NEWVAL, NEWVAL) +dependent_arg_function <- function(x) {list(alphaStem = x$alphaLeaf)} + +test_that('A base model definition can be updated', { + # Without dependent arguments + expect_silent( + update_model( + base_model_definition, + independent_args, + new_ind_arg_values + ) + ) + + # With dependent arguments + new_model <- expect_silent( + update_model( + base_model_definition, + independent_args, + new_ind_arg_values, + dependent_arg_function = dependent_arg_function + ) + ) + + # Initial values are updated + expect_equal( + new_model[['initial_values']][['Leaf']], + NEWVAL + ) + + # Other initial values remain the same + expect_equal( + new_model[['initial_values']][['Stem']], + base_model_definition[['initial_values']][['Stem']] + ) + + # Dependent parameters are updated + expect_equal( + new_model[['parameters']][['alphaStem']], + NEWVAL + ) + + # Other parameters remain the same + expect_equal( + new_model[['parameters']][['alphaRoot']], + base_model_definition[['parameters']][['alphaRoot']] + ) +}) + +test_that('Base model definition must be valid', { + expect_error( + update_model( + base_model_definition[c('direct_modules', 'differential_modules')], + independent_args, + new_ind_arg_values, + dependent_arg_function = dependent_arg_function + ), + 'The `base_model_definition` must have the following elements: initial_values, parameters' + ) +}) + +test_that('Supplied arguments must be part of the base model definition', { + expect_error( + update_model( + base_model_definition, + c(independent_args, list(bad_arg = 10)), + new_ind_arg_values + ), + '`independent_args` and `new_ind_arg_values` must have the same length' + ) + + expect_error( + update_model( + base_model_definition, + c(independent_args, list(bad_arg = 10)), + c(new_ind_arg_values, 25) + ), + 'Values were supplied for the following quantities, but they are not `initial_values` or `parameters` of the `base_model_definition`: bad_arg' + ) +}) diff --git a/vignettes/parameterizing_soybean_biocro.Rmd b/vignettes/parameterizing_soybean_biocro.Rmd index 9f5d540..5be249e 100644 --- a/vignettes/parameterizing_soybean_biocro.Rmd +++ b/vignettes/parameterizing_soybean_biocro.Rmd @@ -46,6 +46,7 @@ load them now: library(BioCroValidation) library(BioCro) library(dfoptim) +library(lattice) ``` # Building the Objective Function @@ -685,6 +686,125 @@ percent larger than with the default parameter values. The optimized parameter values could likely be improved by using a more stringent tolerance for the optimizer, but this would require more time to run. +# Comparing Model Outputs + +Another way to evaluate the results of the optimization is to compare +simulations using the default, perturbed, and re-optimized versions of the +model. + +Following the re-parameterization, we now have new values of the independent +arguments, but this is not enough to actually run the new version of the model. +Thus, the next step is to form a new model definition by updating the values of +the default soybean model. This can be accomplished using the `update_model` +function from `BioCroValidation`: + +```{r form_new_models} +# Get model definition lists for the perturbed and re-parameterized versions of +# the soybean model +soybean_perturbed <- update_model( + BioCro::soybean, + independent_args, + initial_guess, + dependent_arg_function = dependent_arg_function +) + +soybean_reparam <- update_model( + BioCro::soybean, + independent_args, + optim_res[['par']], + dependent_arg_function = dependent_arg_function +) +``` + +We can check that the three models have different values of key parameters, such +as the "dependent" argument `mrc_stem`: + +```{r} +print(BioCro::soybean$parameters$mrc_stem) + +print(soybean_perturbed$parameters$mrc_stem) + +print(soybean_reparam$parameters$mrc_stem) +``` + +Now we can run each version of the model for a single year and visually compare +their outputs: + +```{r comparison_plots} +# Define a helper function that runs a single model for the year 2005 +run_2005 <- function(model_definition) { + with(model_definition, {run_biocro( + initial_values, + parameters, + soybean_weather[['2005']], + direct_modules, + differential_modules, + ode_solver + )}) +} + +# Run each model and combine the results +full_res <- rbind( + within(run_2005(BioCro::soybean), {model = 'Default Soybean-BioCro'}), + within(run_2005(soybean_perturbed), {model = 'Perturbed Soybean-BioCro'}), + within(run_2005(soybean_reparam), {model = 'Re-parameterized Soybean-BioCro'}) +) + +# Plot the results +xyplot( + Leaf + Stem + Root + Grain ~ fractional_doy, + group = model, + data = full_res, + type = 'l', + auto.key = list(space = 'top'), + xlab = 'Day of year (2005)', + ylab = 'Biomass (Mg / ha)' +) +``` + +Here we can see that while the simulated stem, root, and grain (seed) biomass +values do not differ much between the models, there are large differences in the +leaf mass, where the default and re-optimized versions are similar and the +perturbed version predicts much smaller values. + +## Saving the New Model Definition + +A realistic parameterization takes a long time to complete, so it is important +to save the results for later use. One approach is to save the model definition +list using the `save` or `saveRDS` functions from base R. However, these options +create binary files that are not human-readable, and they cannot be easily +tracked using `git`. As an alternative, the `BioCroValidation` includes a +function called `write_model` that forms a string representing an R command that +can be called to re-create a model definition. This command string can be +written to a text file, making it easy to read and to track with `git`. + +Here we apply `write_model` to the re-optimized soybean model: + +```{r write_model_command} +# Convert the re-parameterized soybean model to an R command string +r_cmd_string <- with(soybean_reparam, write_model( + 'soybean_reparam', + direct_modules, + differential_modules, + initial_values, + parameters, + ode_solver +)) +``` + +We can view the resulting R command string: + +```{r} +writeLines(r_cmd_string) +``` + +It can also be written to a text file: + +```{r write_model_to_file, eval = FALSE} +# Save the model definition as an R file in the current working directory +writeLines(r_cmd_string, './soybean_reparam.R') +``` + # Commands From This Document ```{r, eval = FALSE} @@ -746,6 +866,14 @@ stringent tolerance for the optimizer, but this would require more time to run. <> +<> + +<> + +<> + +<> + ``` # References From 388dc5bb144f9cac54e47f9e7c13c2c55a221501 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Thu, 22 May 2025 18:16:12 -0500 Subject: [PATCH 10/13] Add `bounds_table` and use it in the vignette --- NAMESPACE | 1 + NEWS.md | 7 +- R/bounds_table.R | 154 ++++++++++++++++++++ man/bounds_table.Rd | 118 +++++++++++++++ tests/testthat/test-bounds_table.R | 98 +++++++++++++ vignettes/parameterizing_soybean_biocro.Rmd | 63 ++++---- 6 files changed, 407 insertions(+), 34 deletions(-) create mode 100644 R/bounds_table.R create mode 100644 man/bounds_table.Rd create mode 100644 tests/testthat/test-bounds_table.R diff --git a/NAMESPACE b/NAMESPACE index ad14bd8..6c90842 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,3 +1,4 @@ +export(bounds_table) export(objective_function) export(update_model) export(write_model) diff --git a/NEWS.md b/NEWS.md index b13d94e..ab5e3fa 100644 --- a/NEWS.md +++ b/NEWS.md @@ -37,11 +37,8 @@ be directly added to this file to describe the related changes. - Added 2002 and 2005 SoyFACE biomass data. -- Added several new functions: - - - `objective_function` - - - `update_model` +- Added several new functions: `objective_function`, `update_model`, and + `bounds_table` - Added a vignette illustrating how to perform a model parameterization. diff --git a/R/bounds_table.R b/R/bounds_table.R new file mode 100644 index 0000000..2a7e407 --- /dev/null +++ b/R/bounds_table.R @@ -0,0 +1,154 @@ +# A helper function for checking the bounds list for mistakes; if an issue is +# found, this function will throw an error; otherwise it will be silent with no +# return value. +check_bounds_list <- function(bounds_list, independent_args) { + # Must be a list of named elements + if (!is.list(bounds_list) | is.null(names(bounds_list))) { + stop('`bounds_list` must be a list of named elements') + } + + # Must contain all elements in independent_args + missing_element <- sapply(names(independent_args), function(x) { + !x %in% names(bounds_list) + }) + + if (any(missing_element)) { + msg <- paste0( + 'The following elements were included in ', + '`independent_args` but not `bounds_list`: ', + paste(names(independent_args)[missing_element], collapse = ', ') + ) + stop(msg) + } + + # Each element must have length 2 + length_two <- sapply(bounds_list, function(x) { + xlen <- length(x) + + if (is.finite(xlen)) { + length(x) == 2 + } else { + FALSE + } + }) + + if (any(!length_two)) { + msg <- paste0( + 'The following elements of `bounds_list` do not have a length of 2: ', + paste(names(bounds_list)[!length_two], collapse = ', ') + ) + stop(msg) + } + + # Each element must be numeric + not_numeric <- sapply(bounds_list, function(x) {!is.numeric(x)}) + + if (any(not_numeric)) { + msg <- paste0( + 'The following elements of `bounds_list` are not numeric: ', + paste(names(bounds_list)[not_numeric], collapse = ', ') + ) + stop(msg) + } + + return(invisible(NULL)) +} + +# A helper function for checking the initial guess for mistakes; if an issue is +# found, this function will throw an error or a warning; otherwise it will be +# silent with no return value. +check_initial_ind_arg_values <- function( + independent_args, + lbounds, + ubounds, + initial_ind_arg_values +) +{ + # Check the length + if (length(initial_ind_arg_values) != length(independent_args)) { + stop('`initial_ind_arg_values` must have the same length as `independent_args`') + } + + # Check to make sure the initial values are not outside the bounds + outside_bounds <- sapply(seq_along(initial_ind_arg_values), function(i) { + initial_ind_arg_values[i] < lbounds[i] | initial_ind_arg_values[i] > ubounds[i] + }) + + if (any(outside_bounds)) { + msg <- paste0( + 'The initial values for the following arguments lie outside the bounds: ', + paste(names(independent_args)[outside_bounds], collapse = ', ') + ) + stop(msg) + } + + # Check to see if any initial values are on the bounds + eps <- sqrt(.Machine$double.eps) + + on_bounds <- sapply(seq_along(initial_ind_arg_values), function(i) { + abs(initial_ind_arg_values[i] - lbounds[i]) < eps | + abs(initial_ind_arg_values[i] - ubounds[i]) < eps + }) + + if (any(on_bounds)) { + msg <- paste0( + 'The initial values for the following arguments lie on the ', + 'bounds, which can be problematic for some optimizers: ', + paste(names(independent_args)[on_bounds], collapse = ', ') + ) + warning(msg) + } + + return(invisible(NULL)) +} + +bounds_table <- function( + independent_args, + bounds_list, + initial_ind_arg_values = NULL +) +{ + # Check the bounds_list + check_bounds_list(bounds_list, independent_args) + + # Get an ordering for the elements of `bounds_list` so they match the order + # of elements in `independent_args`; note that this will also exclude any + # elements of `bounds_list` that are not included in `independent_args`. + ordering <- match( + names(independent_args), + names(bounds_list) + ) + + bounds_list <- bounds_list[ordering] + + # Get the lower and upper bounds + lbounds <- sapply(bounds_list, min) + ubounds <- sapply(bounds_list, max) + + # Form the bounds table + bounds_table <- data.frame( + arg_name = names(independent_args), + lower = lbounds, + upper = ubounds, + stringsAsFactors = FALSE + ) + + # Include initial values in the table if they were provided + if (!is.null(initial_ind_arg_values)) { + # Check the values + check_initial_ind_arg_values( + independent_args, + lbounds, + ubounds, + initial_ind_arg_values + ) + + # Include the values + bounds_table$initial_value <- initial_ind_arg_values + } + + # Remove row names and return the table + rownames(bounds_table) <- NULL + + bounds_table +} diff --git a/man/bounds_table.Rd b/man/bounds_table.Rd new file mode 100644 index 0000000..b833c59 --- /dev/null +++ b/man/bounds_table.Rd @@ -0,0 +1,118 @@ +\name{bounds_table} + +\alias{bounds_table} + +\title{Create a table of lower and upper bounds} + +\description{ + During an optimization, it is often necessary to provide lower and upper + bounds for the parameters that are being varied. Typically they are specified + as numeric vectors, which often leads to confusing code, where the writer and + reader must remember which value corresponds to each argument; for example, + "the third element of the lower bound vector is for alphaLeaf." + + The purpose of \code{bounds_table} is to make the process of specifying bounds + simpler and easier to follow. It is expected that this function will be called + after \code{\link{objective_function}}. +} + +\usage{ + bounds_table( + independent_args, + bounds_list, + initial_ind_arg_values = NULL + ) +} + +\arguments{ + \item{independent_args}{ + The same value that was passed to \code{\link{objective_function}}. + } + + \item{bounds_list}{ + A list of named elements, where each element is a numeric vector of length + 2. The names should correspond to the independent arguments, and the values + should indicate lower and upper bounds for the corresponding parameter (in + any order). Any "extra" bounds (that is, bounds that do not correspond to + any independent argument) will be ignored. + } + + \item{initial_ind_arg_values}{ + A numeric vector of initial values for each of the independent arguments, + supplied in the same order as in \code{independent_args}. + } +} + +\details{ + The main purpose of this function is to create vectors of lower and upper + bounds, which are returned as the columns of a data frame. For each + independent argument in \code{independent_args}, the bounds are supplied via + the \code{bounds_list} input. The syntax is designed so the code calling this + function is easy for a human to parse. (See example below.) + + It is also (optionally) possible to provide an initial guess for each + independent argument via the \code{initial_ind_arg_values} argument. When + provided, these will be checked to make sure they do not lie outside the + bounds; an error will be thrown if any do lie outside the bounds. A warning + will also be thrown if any initial guesses lie on the bounds, since this can + be problematic for some optimizers, such as \code{\link[dfoptim]{nmkb}}. +} + +\value{ + A data frame with three or four columns: \code{arg_name}, \code{lower}, + \code{upper}, and (optionally) \code{initial_value}. + + The \code{lower} and \code{upper} columns are the lower and upper bounds, + determined from \code{bounds_list}. The \code{arg_name} column is the argument + name, and the rows of the table are ordered as in \code{independent_args}. + The \code{initial_value} column contains initial values, if they were + provided via the \code{initial_ind_arg_values} argument. +} + +\examples{ +# Make a list of independent arguments; the values are not used for anything +independent_args <- list( + alphaLeaf = 0, + alphaRoot = 0, + alphaStem = 0, + betaLeaf = 0, + betaRoot = 0, + betaStem = 0 +) + +# Specify bounds and initial guess for each. Note that: +# +# - The bounds will be reordered to follow the same order as the +# `independent_args`, but the initial guess is assumed to already follow the +# same order as the `independent_args`. +# +# - The bounds for the two extra parameters are ignored when forming the table. +# +# - The lower and upper bounds can be supplied as (upper, lower) +# or (lower, upper) pairs. +# +b_ll <- -50 # Lower limit for beta parameters +a_ul <- 50 # Upper limit for alpha parameters + +bounds <- bounds_table( + independent_args, + list( + betaStem = c(0, b_ll), + betaRoot = c(0, b_ll), + betaLeaf = c(0, b_ll), + alphaStem = c(0, a_ul), + alphaRoot = c(0, a_ul), + alphaLeaf = c(0, a_ul), + extraPar1 = c(0, 5), + extraPar2 = c(0, 6) + ), + c(1, 1, 1, -1, -1, -1) +) + +print(bounds) + +# Now the properly-ordered lower and upper limits can be accessed as follows: +bounds$lower + +bounds$lower +} diff --git a/tests/testthat/test-bounds_table.R b/tests/testthat/test-bounds_table.R new file mode 100644 index 0000000..7790528 --- /dev/null +++ b/tests/testthat/test-bounds_table.R @@ -0,0 +1,98 @@ +independent_args <- list(param1 = 0.1, param2 = 0.2, param3 = 0.3) + +bounds_list <- list( + param1 = c(0, 1), + param3 = c(4, 3), + param2 = c(1, 2), + param4 = c(7, 8) +) + +test_that('Bounds tables can be created', { + # No initial values + expect_silent( + bounds_table( + independent_args, + bounds_list + ) + ) + + # Initial values + bounds <- expect_silent( + bounds_table( + independent_args, + bounds_list, + c(0.5, 1.5, 3.5) + ) + ) + + expect_equal( + bounds[['arg_name']], + c('param1', 'param2', 'param3') + ) + + expect_equal( + bounds[['lower']], + c(0, 1, 3) + ) + + expect_equal( + bounds[['upper']], + c(1, 2, 4) + ) +}) + +test_that('Values outside the bounds are detected', { + expect_error( + bounds_table( + independent_args, + bounds_list, + c(0.5, 2.5, 3.5) + ), + 'The initial values for the following arguments lie outside the bounds: param2' + ) +}) + +test_that('Values on the bounds are detected', { + expect_warning( + bounds_table( + independent_args, + bounds_list, + c(0.5, 2.0, 3.5) + ), + 'The initial values for the following arguments lie on the bounds, which can be problematic for some optimizers: param2' + ) +}) + +test_that('Missing bounds are detected', { + expect_error( + bounds_table( + independent_args, + 1.0 + ), + '`bounds_list` must be a list of named elements' + ) + + expect_error( + bounds_table( + independent_args, + list(1, 2, 3) + ), + '`bounds_list` must be a list of named elements' + ) + + expect_error( + bounds_table( + independent_args, + bounds_list[1:2] + ), + 'The following elements were included in `independent_args` but not `bounds_list`: param2' + ) + + expect_error( + bounds_table( + independent_args, + within(bounds_list, {param1 = 1.0}) + ), + 'The following elements of `bounds_list` do not have a length of 2: param1' + ) +}) diff --git a/vignettes/parameterizing_soybean_biocro.Rmd b/vignettes/parameterizing_soybean_biocro.Rmd index 5be249e..dd48bbc 100644 --- a/vignettes/parameterizing_soybean_biocro.Rmd +++ b/vignettes/parameterizing_soybean_biocro.Rmd @@ -557,38 +557,43 @@ in the past: There are many possible ways to specify the bounds in R, but ultimately they must be expressed as numeric vectors, where the elements are ordered as they are -in `independent_args`. Here we supply lower and upper bounds for each parameter -in a data frame and then ensure they are properly ordered. Later, the data frame -columns can be passed to the optimizer as the bounds: +in `independent_args`. Here we use the `bounds_table` function from +`BioCroValidation` to create a data frame where the lower and upper bounds are +stored as columns. Later, the columns can be passed to the optimizer. The +`bounds_table` function will also check the initial guess to ensure it lies +within the bounds; for more information about this function, see its help page +by typing `?bounds_table` from an R terminal. ```{r setting_bounds} -# Define a table with the bounds -bounds <- read.table( - textConnection(' - arg_name lower upper - alphaLeaf +0 +50 - alphaStem +0 +50 - alphaShell +0 +50 - alphaSeneLeaf +0 +50 - alphaSeneStem +0 +50 - betaLeaf -50 +0 - betaStem -50 +0 - betaShell -50 +0 - betaSeneLeaf -50 +0 - betaSeneStem -50 +0 - rateSeneLeaf +0 +0.0125 - rateSeneStem +0 +0.005 - mrc_leaf +1e-6 +1e-2 - mrc_root +1e-6 +1e-2 - grc_stem +8e-4 +0.08 - grc_root +0.0025 +0.075 - '), - header = TRUE, - stringsAsFactors = FALSE +# Specify some bounds +aul <- 50 # Upper limit for alpha parameters +bll <- -50 # Lower limit for beta parameters +mll <- 1e-6 # Lower limit for mrc parameters +mul <- 1e-2 # Upper limit for mrc parameters + +# Define a table with the bounds in the same order as `independent_args` +bounds <- bounds_table( + independent_args, + list( + alphaLeaf = c(0, aul), + alphaStem = c(0, aul), + alphaShell = c(0, aul), + alphaSeneLeaf = c(0, aul), + alphaSeneStem = c(0, aul), + betaLeaf = c(bll, 0), + betaStem = c(bll, 0), + betaShell = c(bll, 0), + betaSeneLeaf = c(bll, 0), + betaSeneStem = c(bll, 0), + rateSeneLeaf = c(0, 0.0125), + rateSeneStem = c(0, 0.005), + mrc_leaf = c(mll, mul), + mrc_root = c(mll, mul), + grc_stem = c(8e-4, 0.08), + grc_root = c(0.0025, 0.075) + ), + initial_guess ) - -# Make sure it is ordered properly -bounds <- bounds[match(independent_arg_names, bounds[['arg_name']]), ] ``` ## Running the Optimizer From ffd4f85a2a89dee8111e05cf811385bfa3165c94 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Thu, 22 May 2025 18:51:00 -0500 Subject: [PATCH 11/13] Simplify vignette a bit --- vignettes/parameterizing_soybean_biocro.Rmd | 182 +++++++------------- 1 file changed, 61 insertions(+), 121 deletions(-) diff --git a/vignettes/parameterizing_soybean_biocro.Rmd b/vignettes/parameterizing_soybean_biocro.Rmd index dd48bbc..3db1dea 100644 --- a/vignettes/parameterizing_soybean_biocro.Rmd +++ b/vignettes/parameterizing_soybean_biocro.Rmd @@ -83,41 +83,11 @@ pre-processing to get it ready. One issue is that the data set specifies the doy of year (DOY) for each harvest, but we need to specify the time using BioCro's convention (the number of hours -since the start of the year). This can be addressed by defining a helper -function that adds a new `time` column following BioCro's convention: - -```{r convert_time} -# Define a helping function for adding a `time` column -convert_time <- function(data_table) { - within(data_table, { - # Define new `time` column - time = (DOY - 1) * 24.0 - }) -} -``` +since the start of the year). Another issue is that the data set includes pod and seed values, but Soybean-BioCro calculates shell and seed masses, where the shell and seed -together comprise the pod. This can also be addressed with helper functions, -which will be different for biomass values and standard deviations: - -```{r get_shell} -# Define a helping function for calculating shell biomass -add_shell_biomass <- function(data_table) { - within(data_table, { - # The shell is all parts of the pod other than the seed - Shell_Mg_per_ha = Rep_Mg_per_ha - Seed_Mg_per_ha - }) -} - -# Define a helping function for calculating shell biomass standard deviation -add_shell_stdev <- function(data_table) { - within(data_table, { - # Add uncertainties in quadrature, a simple approach to error propagation - Shell_Mg_per_ha = sqrt(Rep_Mg_per_ha^2 + Seed_Mg_per_ha^2) - }) -} -``` +together comprise the pod. Although the observations do not include root biomass, it is nevertheless important to constrain the predicted root mass to reasonable values. To do this, @@ -135,82 +105,60 @@ of 1 implies a standard deviation of \(1 / e - 10^{-5} \approx 0.3678694\). See the documentation page (`?objective_function`) for more information about this weighting method. -The root mass and its standard deviation can be added with another set of helper -functions. Note that any observations set to `NA` will be ignored when -calculating the error metric. - -```{r get_root} -# Define a helping function for adding the root mass -add_root_biomass <- function(data_table) { - # Initialize all values to NA - data_table$Root_Mg_per_ha <- NA +Finally, the data set includes some values that are not needed for the +parameterization. This includes the leaf litter accumulated between each +harvest, as well as the `DOY` and `Rep_Mg_per_ha` columns that have been +superseded by other columns defined above. - # Estimate a mass at one time point - row_to_use <- 5 +Here we will define a helping function that can accomplish the required +modifications described above; note that some operations are different depending +on whether the table represents biomass values or standard deviations: - col_to_add <- c( - 'Leaf_Mg_per_ha', - 'Stem_Mg_per_ha', - 'Rep_Mg_per_ha' - ) +```{r process_tables} +# Define a helping function for processing data tables +process_table <- function(data_table, type) { + # Define new `time` column + data_table$time <- (data_table$DOY - 1) * 24.0 - data_table[row_to_use, 'Root_Mg_per_ha'] <- - 0.17 * sum(data_table[row_to_use, col_to_add]) + # Define new `Shell_Mg_per_ha` column + data_table$Shell_Mg_per_ha <- if (type == 'biomass') { + # The shell is all parts of the pod other than the seed + data_table$Rep_Mg_per_ha - data_table$Seed_Mg_per_ha + } else { + # Add uncertainties in quadrature, a simple approach to error propagation + sqrt(data_table$Rep_Mg_per_ha^2 + data_table$Seed_Mg_per_ha^2) + } - data_table -} + # Define new `Root_Mg_per_ha` column + if (type == 'biomass') { + # Initialize all values to NA + data_table$Root_Mg_per_ha <- NA -# Define a helping function for adding the root standard deviation -add_root_stdev <- function(data_table) { - # We can set a value for each time point; any time points where the root mass - # is NA will be ignored - data_table$Root_Mg_per_ha <- 1 / exp(1) - 1e-5 + # Estimate a mass at one time point + row_to_use <- 5 - data_table -} -``` + col_to_add <- c( + 'Leaf_Mg_per_ha', + 'Stem_Mg_per_ha', + 'Rep_Mg_per_ha' + ) -Finally, the data set includes some values that are not needed for the -parameterization. This includes the leaf litter accumulated between each -harvest, as well as the `DOY` and `Rep_Mg_per_ha` columns that have been -superseded by other columns defined above. These can be removed with a final -helper function: - -```{r remove_columns} -# Define a helping function for removing unneeded columns -remove_extra_columns <- function(data_table) { - within(data_table, { - # Remove columns by setting them to NULL - DOY = NULL - Rep_Mg_per_ha = NULL - Litter_Mg_per_ha = NULL - }) -} -``` + data_table[row_to_use, 'Root_Mg_per_ha'] <- + 0.17 * sum(data_table[row_to_use, col_to_add]) + } else { + # We can set a value for each time point; any time points where the root + # mass is NA will be ignored + data_table$Root_Mg_per_ha <- 1 / exp(1) - 1e-5 + } -Now we can apply these to each table in the set: + # Remove columns by setting them to NULL + data_table$DOY = NULL + data_table$Rep_Mg_per_ha = NULL + data_table$Litter_Mg_per_ha = NULL -```{r process_tables} -# Process the data sets (biomass and stdev from 2002 and 2005) -ambient_2002_biomass <- - remove_extra_columns(add_root_biomass(add_shell_biomass(convert_time( - soyface_biomass[['ambient_2002']] - )))) - -ambient_2005_biomass <- - remove_extra_columns(add_root_biomass(add_shell_biomass(convert_time( - soyface_biomass[['ambient_2005']] - )))) - -ambient_2002_stdev <- - remove_extra_columns(add_root_stdev(add_shell_stdev(convert_time( - soyface_biomass[['ambient_2002_std']] - )))) - -ambient_2005_stdev <- - remove_extra_columns(add_root_stdev(add_shell_stdev(convert_time( - soyface_biomass[['ambient_2005_std']] - )))) + # Return the processed table + data_table +} ``` ## The Data-Driver Pairs @@ -224,14 +172,14 @@ observed biomass, and the weight to assign to each year: # Define the data-driver pairs data_driver_pairs <- list( ambient_2002 = list( - data = ambient_2002_biomass, - data_stdev = ambient_2002_stdev, + data = process_table(soyface_biomass[['ambient_2002']], 'biomass'), + data_stdev = process_table(soyface_biomass[['ambient_2002_std']], 'stdev'), drivers = BioCro::soybean_weather[['2002']], weight = 1 ), ambient_2005 = list( - data = ambient_2005_biomass, - data_stdev = ambient_2005_stdev, + data = process_table(soyface_biomass[['ambient_2005']], 'biomass'), + data_stdev = process_table(soyface_biomass[['ambient_2005_std']], 'stdev'), drivers = BioCro::soybean_weather[['2005']], weight = 1 ) @@ -691,7 +639,7 @@ percent larger than with the default parameter values. The optimized parameter values could likely be improved by using a more stringent tolerance for the optimizer, but this would require more time to run. -# Comparing Model Outputs +## Comparing Model Outputs Another way to evaluate the results of the optimization is to compare simulations using the default, perturbed, and re-optimized versions of the @@ -767,10 +715,10 @@ xyplot( ) ``` -Here we can see that while the simulated stem, root, and grain (seed) biomass -values do not differ much between the models, there are large differences in the -leaf mass, where the default and re-optimized versions are similar and the -perturbed version predicts much smaller values. +Here we can see that while the simulated values for some tissues do not differ +much between the models, there are large differences in other tissues; for these +cases, the default and re-optimized versions are similar and the perturbed +version is much different. ## Saving the New Model Definition @@ -820,19 +768,7 @@ writeLines(r_cmd_string, './soybean_reparam.R') <> ### -### Define helping functions -### - -<> - -<> - -<> - -<> - -### -### Prepare inputs for `objective_function` and create an objective function +### Prepare inputs for `objective_function` ### <> @@ -853,6 +789,10 @@ writeLines(r_cmd_string, './soybean_reparam.R') <> +### +### Create the objective function +### + <> ### From fbeedeb7d58f91fca14e56ab16f06c3d77e412db Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Thu, 22 May 2025 19:06:19 -0500 Subject: [PATCH 12/13] A little more error checking --- R/objective_function_input_checks.R | 13 ++++++++- tests/testthat/test-objective_function.R | 34 ++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/R/objective_function_input_checks.R b/R/objective_function_input_checks.R index dd37fc9..7a7a5a0 100644 --- a/R/objective_function_input_checks.R +++ b/R/objective_function_input_checks.R @@ -245,8 +245,19 @@ check_runner_results <- function( stop(msg) } + # Make sure each runner produces a data frame + is_df <- sapply(initial_runner_res, is.data.frame) + + if (any(!is_df)) { + msg <- paste( + 'Some runners did not produce data frames:', + paste(names(initial_runner_res)[!is_df], collapse = ', ') + ) + stop(msg) + } + # Make sure each runner produces the necessary columns in its output - expected_columns <- as.character(full_data_definitions) + expected_columns <- c('time', as.character(full_data_definitions)) missing_columns <- lapply(initial_runner_res, function(res) { expected_columns[!expected_columns %in% colnames(res)] diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R index 91bdf60..6fee66a 100644 --- a/tests/testthat/test-objective_function.R +++ b/tests/testthat/test-objective_function.R @@ -410,6 +410,40 @@ test_that('Bad return values are detected', { ), 'The objective function did not return a finite value when using the initial argument values; instead, it returned: NA' ) + + # A post-processing function removes the `time` column + expect_error( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = function(x) {within(x, { + Pod = Grain + Shell + time = NULL + })}, + verbose_startup = verbose_startup + ), + 'Some data columns were missing from runner outputs: +ambient_2002: time +ambient_2005: time', + fixed = TRUE + ) + + # A post-processing function doesn't return a data frame + expect_error( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = function(x) {1.0}, + verbose_startup = verbose_startup + ), + 'Some runners did not produce data frames: ambient_2002, ambient_2005' + ) }) test_that('Bad regularization methods are detected', { From d89fe452933c2d02234b544643bea743dd2ae1e4 Mon Sep 17 00:00:00 2001 From: eloch216 <48919455+eloch216@users.noreply.github.com> Date: Thu, 22 May 2025 19:13:15 -0500 Subject: [PATCH 13/13] Fix silly mistake in DESCRIPTION --- DESCRIPTION | 3 --- 1 file changed, 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4dfa230..720c39c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,9 +8,6 @@ Authors@R: c( comment = c(ORCID = "0000-0002-4912-9783")), person("BioCroField authors", role = "cph") ) -License: MIT + file LICENSE -Encoding: UTF-8 -LazyData: true Depends: R (>= 3.6.0) Imports: