diff --git a/.Rbuildignore b/.Rbuildignore index 3c48ecc..69c5505 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -4,6 +4,9 @@ # reason, there should not be any blank comment lines, because they would # exclude any files with `#` in the filename, possibly causing confusing issues. +^\.Rhistory$ +# History files + ^.*\.Rproj$ # Designates the directory as an RStudio Project. diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 94b56af..acfefe6 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -25,7 +25,7 @@ jobs: - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - {os: ubuntu-latest, r: 'release'} - {os: ubuntu-latest, r: 'oldrel-1'} - - {os: ubuntu-latest, r: '4.0.0'} + - {os: ubuntu-latest, r: '3.6.0'} env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} diff --git a/DESCRIPTION b/DESCRIPTION index ef57a21..0e388e9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,20 +1,27 @@ Package: BioCroValidation -Version: 0.1.0 +Version: 0.2.0 Title: Tools for Validating BioCro Models Description: A collection of tools for validating BioCro crop growth models. 
Authors@R: c( person("Edward B.", "Lochocki", role = c('cre', 'aut'), email = "eloch@illinois.edu", comment = c(ORCID = "0000-0002-4912-9783")), - person("BioCroField authors", role = "cph") + person("BioCroValidation authors", role = "cph") ) +Depends: + R (>= 3.6.0) +Imports: + BioCro (>= 3.2.0) +Suggests: + dfoptim, + DEoptim, + lattice, + knitr, + rmarkdown, + testthat (>= 3.0.0) +VignetteBuilder: knitr License: MIT + file LICENSE Encoding: UTF-8 LazyData: true -Depends: - R (>= 4.0.0) -Suggests: - testthat (>= 3.0.0), - BioCro (>= 3.0.0) -URL: https://github.com/BioCro/BioCroField, https://biocro.github.io/BioCroValidation/ +URL: https://github.com/BioCro/BioCroValidation, https://biocro.github.io/BioCroValidation/ Config/testthat/edition: 3 diff --git a/LICENSE b/LICENSE index 2826856..5d895d6 100644 --- a/LICENSE +++ b/LICENSE @@ -1,2 +1,2 @@ YEAR: 2025 -COPYRIGHT HOLDER: BioCroField authors +COPYRIGHT HOLDER: BioCroValidation authors diff --git a/LICENSE.md b/LICENSE.md index e13eb07..190d2df 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,6 +1,6 @@ # MIT License -Copyright (c) 2025 BioCroField authors +Copyright (c) 2025 BioCroValidation authors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/NAMESPACE b/NAMESPACE index eb1ea50..6c90842 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1 +1,4 @@ +export(bounds_table) +export(objective_function) +export(update_model) export(write_model) diff --git a/NEWS.md b/NEWS.md index a074d08..45cd934 100644 --- a/NEWS.md +++ b/NEWS.md @@ -33,10 +33,22 @@ In the case of a hotfix, a short section headed by the new release number should be directly added to this file to describe the related changes. --> +# Changes in BioCroValidation Version 0.2.0 (2025-05-23) + +- Added 2002 and 2005 SoyFACE biomass and standard deviation data. 
+ +- Added several new functions: `objective_function`, `update_model`, and + `bounds_table`. + +- Added two new vignettes: a "Getting Started" article (`BioCroValidation.Rmd`) + and a user guide illustrating how to perform a model parameterization + (`parameterizing_soybean_biocro.Rmd`). + # Changes in BioCroValidation Version 0.1.0 - This is the first version of BioCroValidation. At this point, the package is in a state of rapid development, and not all changes will be described here. + - We are reserving version `1.0.0` for a more stable and complete future release; until then, major changes should only increase the minor version number. diff --git a/R/bounds_table.R b/R/bounds_table.R new file mode 100644 index 0000000..2a7e407 --- /dev/null +++ b/R/bounds_table.R @@ -0,0 +1,154 @@ +# A helper function for checking the bounds list for mistakes; if an issue is +# found, this function will throw an error; otherwise it will be silent with no +# return value. +check_bounds_list <- function(bounds_list, independent_args) { + # Must be a list of named elements + if (!is.list(bounds_list) | is.null(names(bounds_list))) { + stop('`bounds_list` must be a list of named elements') + } + + # Must contain all elements in independent_args + missing_element <- sapply(names(independent_args), function(x) { + !x %in% names(bounds_list) + }) + + if (any(missing_element)) { + msg <- paste0( + 'The following elements were included in ', + '`independent_args` but not `bounds_list`: ', + paste(names(independent_args)[missing_element], collapse = ', ') + ) + stop(msg) + } + + # Each element must have length 2 + length_two <- sapply(bounds_list, function(x) { + xlen <- length(x) + + if (is.finite(xlen)) { + length(x) == 2 + } else { + FALSE + } + }) + + if (any(!length_two)) { + msg <- paste0( + 'The following elements of `bounds_list` do not have a length of 2: ', + paste(names(bounds_list)[!length_two], collapse = ', ') + ) + stop(msg) + } + + # Each element must be numeric + 
not_numeric <- sapply(bounds_list, function(x) {!is.numeric(x)}) + + if (any(not_numeric)) { + msg <- paste0( + 'The following elements of `bounds_list` are not numeric: ', + paste(names(bounds_list)[not_numeric], collapse = ', ') + ) + stop(msg) + } + + return(invisible(NULL)) +} + +# A helper function for checking the initial guess for mistakes; if an issue is +# found, this function will throw an error or a warning; otherwise it will be +# silent with no return value. +check_initial_ind_arg_values <- function( + independent_args, + lbounds, + ubounds, + initial_ind_arg_values +) +{ + # Check the length + if (length(initial_ind_arg_values) != length(independent_args)) { + stop('`initial_ind_arg_values` must have the same length as `independent_args`') + } + + # Check to make sure the initial values are not outside the bounds + outside_bounds <- sapply(seq_along(initial_ind_arg_values), function(i) { + initial_ind_arg_values[i] < lbounds[i] | initial_ind_arg_values[i] > ubounds[i] + }) + + if (any(outside_bounds)) { + msg <- paste0( + 'The initial values for the following arguments lie outside the bounds: ', + paste(names(independent_args)[outside_bounds], collapse = ', ') + ) + stop(msg) + } + + # Check to see if any initial values are on the bounds + eps <- sqrt(.Machine$double.eps) + + on_bounds <- sapply(seq_along(initial_ind_arg_values), function(i) { + abs(initial_ind_arg_values[i] - lbounds[i]) < eps | + abs(initial_ind_arg_values[i] - ubounds[i]) < eps + }) + + if (any(on_bounds)) { + msg <- paste0( + 'The initial values for the following arguments lie on the ', + 'bounds, which can be problematic for some optimizers: ', + paste(names(independent_args)[on_bounds], collapse = ', ') + ) + warning(msg) + } + + return(invisible(NULL)) +} + +bounds_table <- function( + independent_args, + bounds_list, + initial_ind_arg_values = NULL +) +{ + # Check the bounds_list + check_bounds_list(bounds_list, independent_args) + + # Get an ordering for the elements of 
`bounds_list` so they match the order + # of elements in `independent_args`; note that this will also exclude any + # elements of `bounds_list` that are not included in `independent_args`. + ordering <- match( + names(independent_args), + names(bounds_list) + ) + + bounds_list <- bounds_list[ordering] + + # Get the lower and upper bounds + lbounds <- sapply(bounds_list, min) + ubounds <- sapply(bounds_list, max) + + # Form the bounds table + bounds_table <- data.frame( + arg_name = names(independent_args), + lower = lbounds, + upper = ubounds, + stringsAsFactors = FALSE + ) + + # Include initial values in the table if they were provided + if (!is.null(initial_ind_arg_values)) { + # Check the values + check_initial_ind_arg_values( + independent_args, + lbounds, + ubounds, + initial_ind_arg_values + ) + + # Include the values + bounds_table$initial_value <- initial_ind_arg_values + } + + # Remove row names and return the table + rownames(bounds_table) <- NULL + + bounds_table +} diff --git a/R/objective_function.R b/R/objective_function.R new file mode 100644 index 0000000..6c21e00 --- /dev/null +++ b/R/objective_function.R @@ -0,0 +1,134 @@ +## Here we use several functions that are defined in +## `objective_function_input_checks.R` and `objective_function_helpers.R` + +objective_function <- function( + base_model_definition, + data_driver_pairs, + independent_args, + quantity_weights, + data_definitions = list(), + normalization_method = 'mean_max', + normalization_param = NULL, + stdev_weight_method = 'equal', + stdev_weight_param = NULL, + regularization_method = 'none', + dependent_arg_function = NULL, + post_process_function = NULL, + extra_penalty_function = NULL, + verbose_startup = TRUE +) +{ + # Check the data-driver pairs + check_data_driver_pairs(base_model_definition, data_driver_pairs) + + # Check the arguments to be varied + check_args_to_vary( + independent_args, + dependent_arg_function, + data_driver_pairs, + verbose_startup + ) + + # Get the model 
runners + model_runners <- lapply(data_driver_pairs, function(ddp) { + get_model_runner( + base_model_definition, + independent_args, + dependent_arg_function, + post_process_function, + ddp + ) + }) + + # Get the full data definition list + full_data_definitions <- + get_data_definition_list(data_driver_pairs, data_definitions) + + # Print the full data definition list, if desired + if (verbose_startup) { + cat('\nThe full data definitions:\n\n') + utils::str(full_data_definitions) + } + + # Check the model runners + check_runners(model_runners) + + # Get initial model runner results + initial_runner_res <- + get_initial_runner_res(model_runners, independent_args) + + # Check the initial model runner results + check_runner_results( + initial_runner_res, + full_data_definitions, + data_driver_pairs + ) + + # Get the long-form data + long_form_data <- + get_long_form_data(data_driver_pairs, full_data_definitions) + + # Find indices corresponding to the measured time points + long_form_data <- add_time_indices(initial_runner_res, long_form_data) + + # Add normalization factors + long_form_data <- add_norm( + long_form_data, + normalization_method, + normalization_param, + length(data_driver_pairs) + ) + + # Add variance-based weights + long_form_data <- add_w_var( + long_form_data, + stdev_weight_method, + stdev_weight_param + ) + + # Print the long form data, if desired. 
Do this before checking the data, + # so the printout will be available for troubleshooting + if (verbose_startup) { + cat('\nThe user-supplied data in long form:\n\n') + print(long_form_data) + } + + # Check the processed long-form data + check_long_form_data(long_form_data) + + # Process the quantity weights + full_quantity_weights <- + process_quantity_weights(quantity_weights, long_form_data) + + # Print the quantity weights, if desired + if (verbose_startup) { + cat('The user-supplied quantity weights:\n\n') + utils::str(full_quantity_weights) + } + + # Get the data-driver pair weights + ddp_weights <- get_ddp_weights(data_driver_pairs) + + # Print the data-driver pair weights, if desired + if (verbose_startup) { + cat('\nThe user-supplied data-driver pair weights:\n\n') + utils::str(ddp_weights) + } + + # Create the objective function + obj_fun <- get_obj_fun( + model_runners, + long_form_data, + full_quantity_weights, + ddp_weights, + normalization_method, + extra_penalty_function, + regularization_method + ) + + # Check the objective function + check_obj_fun(obj_fun, independent_args, verbose_startup) + + # Return it + obj_fun +} diff --git a/R/objective_function_helpers.R b/R/objective_function_helpers.R new file mode 100644 index 0000000..ae5ef09 --- /dev/null +++ b/R/objective_function_helpers.R @@ -0,0 +1,492 @@ +## All the functions defined in this file are intended to perform key operations +## required by `objective_function`. 
+ +# Value to return when a simulation fails to run +FAILURE_VALUE <- 1e10 + +# Helping function for getting a full list of argument names +get_full_arg_names <- function(independent_args, dependent_arg_function) { + # Get the independent argument names + independent_arg_names <- names(independent_args) + + # Get the full list of arg_names + if (is.null(dependent_arg_function)) { + independent_arg_names + } else { + dependent_arg_values <- + dependent_arg_function(independent_args) + + c(independent_arg_names, names(dependent_arg_values)) + } +} + +# Helping function for getting a model runner; if the runner cannot be created, +# an error message will be returned instead +get_model_runner <- function( + base_model_definition, + independent_args, + dependent_arg_function, + post_process_function, + ddp +) +{ + # Get the independent argument names + independent_arg_names <- names(independent_args) + + # Get the full list of arg_names + arg_names <- get_full_arg_names(independent_args, dependent_arg_function) + + # Build the runner + tryCatch({ + partial_func <- BioCro::partial_run_biocro( + base_model_definition[['initial_values']], + base_model_definition[['parameters']], + ddp[['drivers']], + base_model_definition[['direct_modules']], + base_model_definition[['differential_modules']], + base_model_definition[['ode_solver']], + arg_names + ) + + function(x) { + if (!is.numeric(x)) { + stop('`x` must be numeric') + } + + x_for_partial <- if (is.null(dependent_arg_function)) { + x + } else { + x_for_dep_arg_func <- + stats::setNames(x, independent_arg_names) + + c(x, as.numeric(dependent_arg_function(x_for_dep_arg_func))) + } + + if (any(!is.finite(x_for_partial))) { + stop( + 'At least one independent or dependent argument ', + 'value is not finite' + ) + } + + initial_res <- partial_func(x_for_partial) + + if (is.null(post_process_function)) { + initial_res + } else { + post_process_function(initial_res) + } + } + }, + error = function(e) {as.character(e)} + ) +} + 
+# Helping function for running each runner with the initial argument values +get_initial_runner_res <- function(model_runners, independent_args) { + lapply(model_runners, function(runner) { + tryCatch( + runner(as.numeric(independent_args)), + error = function(e) {as.character(e)} + ) + }) +} + +# Helping function for getting a "data definition list," which specifies the +# names of the `data` columns as they appear in the simulation output +get_data_definition_list <- function(data_driver_pairs, user_data_definitions) +{ + # First get all the column names found in the observed data + all_data_colnames <- + lapply(data_driver_pairs, function(x) {colnames(x[['data']])}) + + all_data_colnames <- unlist(all_data_colnames) + + all_data_colnames <- all_data_colnames[!duplicated(all_data_colnames)] + + # Remove the `time` column + all_data_colnames <- all_data_colnames[all_data_colnames != 'time'] + + # Build the data definition list + data_definitions <- lapply(all_data_colnames, function(cn) { + if (cn %in% names(user_data_definitions)) { + user_data_definitions[[cn]] + } else { + cn + } + }) + + names(data_definitions) <- all_data_colnames + + data_definitions +} + +# Helping function for converting each data table to a "long form," including +# stdev values when available +get_long_form_data <- function(data_driver_pairs, full_data_definitions) { + lapply(data_driver_pairs, function(ddp) { + short_form_data <- ddp[['data']] + + has_std <- 'data_stdev' %in% names(ddp) + + short_form_stdev <- if (has_std) { + ddp[['data_stdev']] + } else { + NA + } + + data_column_names <- colnames(short_form_data) + data_column_names <- data_column_names[data_column_names != 'time'] + + long_form_data_list <- lapply(data_column_names, function(cn) { + data.frame( + time = short_form_data[, 'time'], + quantity_name = full_data_definitions[[cn]], + quantity_value = short_form_data[, cn], + quantity_stdev = if (has_std) {short_form_stdev[, cn]} else {1}, + stringsAsFactors = FALSE + ) + 
}) + + long_form_data <- do.call(rbind, long_form_data_list) + + long_form_data[!is.na(long_form_data[['quantity_value']]), ] + }) +} + +# Helping function for getting time indices +add_time_indices <- function(initial_runner_res, long_form_data) { + for (i in seq_along(long_form_data)) { + res <- initial_runner_res[[i]] + dataf <- long_form_data[[i]] + + indices <- sapply(dataf[, 'time'], function(x) { + tdiff <- abs(res[, 'time'] - x) + + # Take only the first match, in case there are more + which(tdiff == min(tdiff))[1] + }) + + long_form_data[[i]][, 'time_index'] <- indices + long_form_data[[i]][, 'expected_npts'] <- nrow(res) + } + + long_form_data +} + +# Helping function for getting a parameter value that has a default +get_param_default <- function(param, default) { + if (is.null(param)) { + default + } else { + param + } +} + +# Helping function for getting normalization factors +add_norm <- function( + long_form_data, + normalization_method, + normalization_param, + n_ddp +) +{ + for (i in seq_along(long_form_data)) { + data_table <- long_form_data[[i]] + + data_table[['norm']] <- sapply(seq_len(nrow(data_table)), function(j) { + qname <- data_table[j, 'quantity_name'] + + qname_subset <- + data_table[data_table[['quantity_name']] == qname, ] + + if (tolower(normalization_method) == 'equal') { + 1.0 + } else if (tolower(normalization_method) == 'mean') { + nrow(qname_subset) * n_ddp + } else if (tolower(normalization_method) == 'max') { + max(qname_subset[['quantity_value']])^2 + } else if (tolower(normalization_method) == 'obs') { + eps <- get_param_default(normalization_param, 1e-1) + data_table[j, 'quantity_value']^2 + eps + } else if (tolower(normalization_method) == 'mean_max') { + npts <- nrow(qname_subset) + qmax <- max(qname_subset[['quantity_value']]) + npts * n_ddp * qmax^2 + } else if (tolower(normalization_method) == 'mean_obs') { + npts <- nrow(qname_subset) + eps <- get_param_default(normalization_param, 1e-1) + npts * n_ddp * (data_table[j, 
'quantity_value']^2 + eps) + } else { + stop('Unsupported normalization_method: ', normalization_method) + } + }) + + long_form_data[[i]] <- data_table + } + + long_form_data +} + +# Helping function for getting variance-based weights +add_w_var <- function(long_form_data, stdev_weight_method, stdev_weight_param) { + for (i in seq_along(long_form_data)) { + data_table <- long_form_data[[i]] + data_stdev <- data_table[['quantity_stdev']] + + data_table[['w_var']] <- + if (tolower(stdev_weight_method) == 'equal') { + 1.0 + } else if (tolower(stdev_weight_method) == 'logarithm') { + eps <- get_param_default(stdev_weight_param, 1e-5) + log(1 / (data_stdev + eps)) + } else if (tolower(stdev_weight_method) == 'inverse') { + eps <- get_param_default(stdev_weight_param, 1e-1) + 1 / (data_stdev^2 + eps) + } else { + stop('Unsupported stdev_weight_method: ', stdev_weight_method) + } + + long_form_data[[i]] <- data_table + } + + long_form_data +} + +# Helping function that processes and checks the quantity weights +process_quantity_weights <- function(quantity_weights, long_form_data) { + # First make sure that weights have been provided for all measured + # quantities + all_data_colnames <- lapply(long_form_data, function(x) { + unique(x[, 'quantity_name']) + }) + + all_data_colnames <- unlist(all_data_colnames) + + all_data_colnames <- unique(all_data_colnames) + + weight_was_supplied <- sapply(all_data_colnames, function(cn) { + cn %in% names(quantity_weights) + }) + + if (any(!weight_was_supplied)) { + missing_weights <- all_data_colnames[!weight_was_supplied] + + msg <- paste( + 'Weights were not supplied for the following measured quantities:', + paste(missing_weights, collapse = ', ') + ) + + stop(msg) + } + + # Now make sure all the weights have length 2 + lapply(quantity_weights, function(wt) { + rep_len(wt, 2) + }) +} + +# Helping function for getting the data-driver pair weights +get_ddp_weights <- function(data_driver_pairs) { + lapply(data_driver_pairs, 
function(ddp) { + ddp[['weight']] + }) +} + +# Helping function that calculates one error +one_error <- function( + observed, + predicted, + quantity_weight, + ddp_weight, + var_weight, + normalization +) +{ + if (!is.finite(predicted)) { + NA + } else { + qw <- if (predicted < observed) { + quantity_weight[1] # Underprediction + } else { + quantity_weight[2] # Overprediction + } + + (observed - predicted)^2 * qw * ddp_weight * var_weight / normalization + } +} + +# Helping function for returning a failure value +failure_value <- function(error_sum, return_terms) { + if (return_terms) { + list( + least_squares_term = error_sum, + extra_penalty = FAILURE_VALUE + ) + } else { + FAILURE_VALUE + } +} + +# Helping function that calculates an error value from a simulation result +error_from_res <- function( + simulation_result, + long_form_data_table, + quantity_weights, + ddp_weight, + normalization_method, + extra_penalty_function, + return_terms +) +{ + # If the simulation did not finish, return a very high value + expected_npts <- long_form_data_table[1, 'expected_npts'] + + if (nrow(simulation_result) < expected_npts) { + return( + failure_value(NA, return_terms) + ) + } + + # Calculate any user-specified penalties + penalty <- if (is.null(extra_penalty_function)) { + 0.0 + } else { + extra_penalty_function(simulation_result) + } + + # Calculate the error terms + n_obs <- nrow(long_form_data_table) + + errors <- sapply(seq_len(n_obs), function(i) { + qname <- as.character(long_form_data_table[i, 'quantity_name']) + indx <- long_form_data_table[i, 'time_index'] + + one_error( + long_form_data_table[i, 'quantity_value'], # obs + simulation_result[indx, qname], # pred + quantity_weights[[qname]], # quantity_weight + ddp_weight, # ddp_weight + long_form_data_table[i, 'w_var'], # var_weight + long_form_data_table[i, 'norm'] # norm + ) + }) + + error_sum <- sum(errors) + + # If the error sum is not finite, return a very high value + if (!is.finite(error_sum)) { + return( 
+ failure_value(error_sum, return_terms) + ) + } + + # Return the sum of the penalty and error terms, or the individual errors + if (return_terms) { + error_terms_by_quantity <- as.list(tapply( + errors, + long_form_data_table[['quantity_name']], + sum + )) + + list( + least_squares_terms = error_terms_by_quantity, + extra_penalty = penalty + ) + } else { + penalty + error_sum + } +} + +# Helping function for calculating a regularization penalty term +regularization_penalty <- function( + ind_arg_vals, + regularization_method, + regularization_lambda +) +{ + if (toupper(regularization_method) == 'NONE') { + 0.0 + } else if (toupper(regularization_method) == 'LASSO' || toupper(regularization_method) == 'L1') { + regularization_lambda * sum(abs(ind_arg_vals)) + } else if (toupper(regularization_method) == 'RIDGE' || toupper(regularization_method) == 'L2') { + regularization_lambda * sum(ind_arg_vals^2) + } else { + stop('Unsupported regularization method: ', regularization_method) + } +} + +# Helping function that forms the overall objective function +get_obj_fun <- function( + model_runners, + long_form_data, + full_quantity_weights, + ddp_weights, + normalization_method, + extra_penalty_function, + regularization_method +) +{ + function(x, lambda = 0, return_terms = FALSE, debug_mode = FALSE) { + if (debug_mode) { + msg <- paste0( + '\nTime: ', + Sys.time(), + ' Independent argument values: ', + paste(x, collapse = ', '), + '\n' + ) + cat(msg) + } + + errors <- lapply(seq_along(model_runners), function(i) { + runner <- model_runners[[i]] + res <- runner(x) + + error_from_res( + res, + long_form_data[[i]], + full_quantity_weights, + ddp_weights[[i]], + normalization_method, + extra_penalty_function, + return_terms + ) + }) + + reg_penalty <- regularization_penalty(x, regularization_method, lambda) + + if (return_terms) { + error_metric_terms <- list( + terms_from_data_driver_pairs = stats::setNames( + errors, + names(model_runners) + ), + regularization_penalty = 
reg_penalty + ) + + if (debug_mode) { + cat(paste0('Time: ', Sys.time()), ' Error metric terms: ') + utils::str(error_metric_terms) + cat('\n') + } + + error_metric_terms + } else { + error_metric <- sum(as.numeric(errors)) + reg_penalty + + if (debug_mode) { + msg <- paste0( + 'Time: ', + Sys.time(), + ' Error metric: ', + error_metric, + '\n' + ) + cat(msg) + } + + error_metric + } + } +} diff --git a/R/objective_function_input_checks.R b/R/objective_function_input_checks.R new file mode 100644 index 0000000..7a7a5a0 --- /dev/null +++ b/R/objective_function_input_checks.R @@ -0,0 +1,445 @@ +## All the functions defined in this file are intended to check inputs for +## certain errors. Each function will throw an error if a problem is detected, +## and will otherwise be silent with no return value. + +# Helping function for checking the data-driver pairs; will throw an error if +# a problem is detected, and will otherwise be silent with no return value. +check_data_driver_pairs <- function(base_model_definition, data_driver_pairs) { + # There must be at least one data-driver pair + if (length(data_driver_pairs) < 1) { + stop('`data_driver_pairs` must have at least one element') + } + + # Data-driver pairs must have names + if (is.null(names(data_driver_pairs))) { + stop('`data_driver_pairs` must have names') + } + + # Each data-driver pair must have the required elements + required_elements <- c('drivers', 'data', 'weight') + + has_elements <- sapply(data_driver_pairs, function(x) { + all(required_elements %in% names(x)) + }) + + if (any(!has_elements)) { + missing_elements <- names(data_driver_pairs)[!has_elements] + + msg <- paste0( + 'The following data-driver pairs are missing at least one ', + 'required element (', + paste(required_elements, collapse = ', '), + '): ', + paste(missing_elements, collapse = ', ') + ) + + stop(msg) + } + + # Only required or optional elements should be provided + optional_elements <- 'data_stdev' + + acceptable_elements <- 
c(required_elements, optional_elements) + + has_extra_elements <- sapply(data_driver_pairs, function(x) { + any(!names(x) %in% acceptable_elements) + }) + + if (any(has_extra_elements)) { + bad_elements <- names(data_driver_pairs)[has_extra_elements] + + msg <- paste0( + 'The following data-driver pairs have unexpected elements: ', + paste(bad_elements, collapse = ', '), + '. The allowed elements are: ', + paste(acceptable_elements, collapse = ', '), + '.' + ) + + stop(msg) + } + + # Each data table must have a time column + has_time <- sapply(data_driver_pairs, function(x) { + 'time' %in% colnames(x[['data']]) + }) + + if (any(!has_time)) { + missing_time <- names(data_driver_pairs)[!has_time] + + msg <- paste( + 'The following data-driver pairs are missing a `time` column', + 'in their `data` element:', + paste(missing_time, collapse = ', ') + ) + + stop(msg) + } + + # If provided, stdev tables must have the same columns and time values as + # their corresponding data tables. Time values must also be in the same + # order. 
+ stdev_okay <- sapply(data_driver_pairs, function(x) { + if ('data_stdev' %in% names(x)) { + data_table <- x[['data']] + stdev_table <- x[['data_stdev']] + + if (is.null(colnames(stdev_table))) { + FALSE + } else { + colnames_match <- identical( + sort(colnames(data_table)), + sort(colnames(stdev_table)) + ) + + times_match <- isTRUE(all.equal( + data_table[['time']], + stdev_table[['time']] + )) + + colnames_match && times_match + } + } else { + TRUE + } + }) + + if (any(!stdev_okay)) { + bad_stdev <- names(data_driver_pairs)[!stdev_okay] + + msg <- paste( + 'The following data-driver pairs have a `data_stdev` element', + 'that does not match the columns and/or times of their', + '`data` element:', + paste(bad_stdev, collapse = ', ') + ) + + stop(msg) + } + + # Each set of drivers must form a valid dynamical system along with the + # base model definition + valid_definitions <- sapply(data_driver_pairs, function(ddp) { + BioCro::validate_dynamical_system_inputs( + base_model_definition[['initial_values']], + base_model_definition[['parameters']], + ddp[['drivers']], + base_model_definition[['direct_modules']], + base_model_definition[['differential_modules']], + verbose = FALSE + ) + }) + + if (any(!valid_definitions)) { + invalid_ddp <- names(data_driver_pairs)[!valid_definitions] + + msg <- paste( + 'The following drivers did not form a valid dynamical system:', + paste(invalid_ddp, collapse = ', ') + ) + + stop(msg) + } + + return(invisible(NULL)) +} + +# Helping function for checking the independent arguments +check_args_to_vary <- function( + independent_args, + dependent_arg_function, + data_driver_pairs, + verbose +) +{ + # Make sure the independent arguments have names + ind_arg_names <- names(independent_args) + + if (is.null(ind_arg_names)) { + stop('`independent_args` must have names') + } + + # Make sure the dependent argument function returns a named list + if (!is.null(dependent_arg_function)) { + dep_arg_names <- + 
names(dependent_arg_function(independent_args)) + + if (is.null(dep_arg_names)) { + stop('The return value of `dependent_arg_function` must have names') + } + } + + # Print argument names, if necessary + if (verbose) { + cat('\nThe independent arguments and their initial values:\n\n') + utils::str(independent_args) + + if (!is.null(dependent_arg_function)) cat('\nThe dependent arguments and their initial values:\n\n') + if (!is.null(dependent_arg_function)) utils::str(dependent_arg_function(independent_args)) + } + + # Make sure no drivers were specified + arg_names <- get_full_arg_names(independent_args, dependent_arg_function) + + args_in_drivers <- lapply(data_driver_pairs, function(ddp) { + driver_names <- names(ddp[['drivers']]) + arg_names[arg_names %in% driver_names] + }) + + args_in_drivers <- unique(unlist(args_in_drivers)) + + if (length(args_in_drivers) > 0) { + msg <- paste( + 'Some independent or dependent argument names refer to columns', + 'in the drivers:', + paste(args_in_drivers, collapse = ', ') + ) + + stop(msg) + } + + return(invisible(NULL)) +} + +# Helping function for checking the model runners; will throw an error if a +# problem is detected, and will otherwise be silent with no return value. +check_runners <- function(model_runners) { + bad_runners <- sapply(model_runners, is.character) + + if (any(bad_runners)) { + bad_runner_names <- names(model_runners)[bad_runners] + bad_runner_msg <- as.character(model_runners[bad_runners]) + + msg <- paste0( + 'Model runners could not be created for the following drivers:\n', + paste0(bad_runner_names, ': ', bad_runner_msg, collapse = '') + ) + + stop(msg) + } + + return(invisible(NULL)) +} + +# Helping function for checking the initial model runner results; will throw an +# error if a problem is detected, and will otherwise be silent with no return +# value. 
+check_runner_results <- function( + initial_runner_res, + full_data_definitions, + data_driver_pairs +) +{ + # Check for runners that could not be evaluated + bad_res <- sapply(initial_runner_res, is.character) + + if (any(bad_res)) { + bad_runner_names <- names(initial_runner_res)[bad_res] + bad_runner_msg <- initial_runner_res[bad_res] + + msg <- paste0( + 'The model could not be run with the following drivers:\n', + paste0(bad_runner_names, ': ', bad_runner_msg, collapse = '') + ) + + stop(msg) + } + + # Make sure each runner produces a data frame + is_df <- sapply(initial_runner_res, is.data.frame) + + if (any(!is_df)) { + msg <- paste( + 'Some runners did not produce data frames:', + paste(names(initial_runner_res)[!is_df], collapse = ', ') + ) + stop(msg) + } + + # Make sure each runner produces the necessary columns in its output + expected_columns <- c('time', as.character(full_data_definitions)) + + missing_columns <- lapply(initial_runner_res, function(res) { + expected_columns[!expected_columns %in% colnames(res)] + }) + + bad_outputs <- sapply(missing_columns, function(x) { + length(x) > 0 + }) + + if (any(bad_outputs)) { + msg <- 'Some data columns were missing from runner outputs:' + + for (i in seq_along(bad_outputs)) { + if (bad_outputs[i]) { + msg <- append( + msg, + paste0( + names(initial_runner_res)[i], ': ', + paste(missing_columns[[i]], collapse = ', ') + ) + ) + } + } + + stop(paste(msg, collapse = '\n')) + } + + # Make sure the output from each runner includes the observed times + times_out_of_range <- lapply(seq_along(initial_runner_res), function(i) { + res <- initial_runner_res[[i]] + + min_time <- min(res[['time']]) + max_time <- max(res[['time']]) + + data_times <- data_driver_pairs[[i]][['data']][['time']] + + oor <- sapply(data_times, function(datat) { + datat < min_time || datat > max_time + }) + + c(min_time, max_time, data_times[oor]) + }) + + bad_times <- sapply(times_out_of_range, function(x) { + length(x) > 2 + }) + + if 
(any(bad_times)) { + msg <- 'Some observed times were missing from runner outputs:' + + for (i in seq_along(bad_times)) { + if (bad_times[i]) { + raw_times_oor <- times_out_of_range[[i]] + + min_time <- raw_times_oor[1] + max_time <- raw_times_oor[2] + time_oor <- raw_times_oor[seq(3, length(raw_times_oor))] + + msg <- append( + msg, + paste0( + names(initial_runner_res)[i], ': ', + paste(time_oor, collapse = ', '), + ' (min_time = ', min_time, + ', max_time = ', max_time, ')' + ) + ) + } + } + + stop(paste(msg, collapse = '\n')) + } + + return(invisible(NULL)) +} + +# Helping function for checking the long-form data; will throw an error if a +# problem is detected, and will otherwise be silent with no return value. +check_long_form_data <- function(long_form_data) { + # Check each element for issues + messages <- sapply(long_form_data, function(lfd) { + msg <- character() + + # Check that certain columns have finite values + check_for_not_finite <- c( + 'time', + 'quantity_value', + 'quantity_stdev', + 'time_index', + 'expected_npts', + 'norm', + 'w_var' + ) + + not_finite <- sapply(check_for_not_finite, function(cn) { + any(!is.finite(lfd[, cn])) + }) + + if (any(not_finite)) { + not_finite_col <- check_for_not_finite[not_finite] + + new_msg <- paste( + 'The following columns contained non-finite values:', + paste(not_finite_col, collapse = ', ') + ) + + msg <- append(msg, new_msg) + } + + # Check that certain columns do not have negative values + check_for_negative <- c( + 'quantity_stdev', + 'time_index', + 'expected_npts', + 'norm', + 'w_var' + ) + + negative <- sapply(check_for_negative, function(cn) { + any(lfd[, cn] < 0) + }) + + if (any(negative)) { + negative_col <- check_for_negative[negative] + + new_msg <- paste( + 'The following columns contained negative values:', + paste(negative_col, collapse = ', ') + ) + + msg <- append(msg, new_msg) + } + + # Return any messages + paste(msg, collapse = '\n ') + }) + + # Send a message if problems were detected + 
error_found <- messages != '' + + if (any(error_found)) { + data_names <- names(long_form_data)[error_found] + error_messages <- messages[error_found] + + msg <- paste0( + 'Issues were found with the following data sets:\n ', + paste0( + data_names, ':\n ', + error_messages, + collapse = '\n ' + ) + ) + + stop(msg) + } + + return(invisible(NULL)) +} + +# Helping function for checking the objective function; will throw an error if a +# problem is detected, and will otherwise be silent with no return value. +check_obj_fun <- function(obj_fun, initial_ind_arg_values, verbose) { + initial_error_terms <- + obj_fun(as.numeric(initial_ind_arg_values), return_terms = TRUE) + + initial_error <- sum(unlist(initial_error_terms)) + + if (verbose) { + cat('\nThe initial error metric terms:\n\n') + utils::str(initial_error_terms) + + cat('\nThe initial error metric value:\n\n') + print(initial_error) + } + + if (!is.finite(initial_error)) { + stop( + 'The objective function did not return a finite value when using ', + 'the initial argument values; instead, it returned: ', + initial_error + ) + } + + return(invisible(NULL)) +} diff --git a/R/update_model.R b/R/update_model.R new file mode 100644 index 0000000..1a5d32a --- /dev/null +++ b/R/update_model.R @@ -0,0 +1,80 @@ +update_model <- function( + base_model_definition, + independent_args, + new_ind_arg_values, + dependent_arg_function = NULL +) +{ + # Make sure the model definition has initial_values and parameters + req_elements <- c('initial_values', 'parameters') + if (!all(req_elements %in% names(base_model_definition))) { + stop( + 'The `base_model_definition` must have the following elements: ', + paste(req_elements, collapse = ', ') + ) + } + + # Make sure the argument lists have the same length + if (length(independent_args) != length(new_ind_arg_values)) { + stop('`independent_args` and `new_ind_arg_values` must have the same length') + } + + # Update the values of the independent arguments + new_independent_args <- 
stats::setNames( + as.list(new_ind_arg_values), + names(independent_args) + ) + + # Also get the values of the dependent arguments + all_args <- if (!is.null(dependent_arg_function)) { + c(new_independent_args, dependent_arg_function(new_independent_args)) + } else { + new_independent_args + } + + # Find all quantities in the initial values and parameters and store them in + # a list + iv <- as.list( + rep_len( + 'initial_values', + length(base_model_definition[['initial_values']]) + ) + ) + names(iv) <- names(base_model_definition[['initial_values']]) + + param <- as.list( + rep_len( + 'parameters', + length(base_model_definition[['parameters']]) + ) + ) + names(param) <- names(base_model_definition[['parameters']]) + + model_quantities <- c(iv, param) + + # Make sure each supplied argument is included in the model + not_in_model <- !names(all_args) %in% names(model_quantities) + + if (any(not_in_model)) { + msg <- paste0( + 'Values were supplied for the following quantities, but they ', + 'are not `initial_values` or `parameters` of ', + 'the `base_model_definition`: ', + paste(names(all_args)[not_in_model], collapse = ', ') + ) + stop(msg) + } + + # Make a copy of the model with the new argument values and return it + new_model_definition <- base_model_definition + + for (i in seq_along(all_args)) { + arg_name <- names(all_args)[i] + arg_type <- model_quantities[[arg_name]] + arg_value <- all_args[[i]] + + new_model_definition[[arg_type]][[arg_name]] <- arg_value + } + + new_model_definition +} diff --git a/README.md b/README.md index 2b2c91c..f6eeee0 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,15 @@ remotes::install_github('BioCro/BioCroValidation') Note that this method requires the `remotes` package, which can be installed from within R by typing `install.packages('remotes')`. 
+### Usage + +The best way to learn about using `BioCroValidation` is to visit the +[BioCroValidation website](https://biocro.github.io/BioCroValidation/index.html) +and click the "Get Started" link in the top menu bar. The website includes +documentation for all the functions and data sets included in the package, as +well as articles that describe its general features and several important use +cases. + ### License The `BioCroValidation` R package and its documentation are licensed under the diff --git a/_pkgdown.yml b/_pkgdown.yml index 76ebccb..27d10ba 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,5 +1,6 @@ url: https://biocro.github.io/BioCroValidation/ template: + math-rendering: mathjax bootstrap: 5 bootswatch: flatly theme: arrow-light diff --git a/data/soyface_biomass.R b/data/soyface_biomass.R new file mode 100644 index 0000000..dc36357 --- /dev/null +++ b/data/soyface_biomass.R @@ -0,0 +1,54 @@ +soyface_biomass <- list( + ambient_2002 = utils::read.csv( + textConnection(' + DOY,Leaf_Mg_per_ha,Stem_Mg_per_ha,Rep_Mg_per_ha,Seed_Mg_per_ha,Litter_Mg_per_ha,CumLitter_Mg_per_ha + 179,0.180284339443109,0.085244969371991,0,0,0,0 + 189,0.55446194221275,0.41885389322975,0,0,0,0 + 203,1.32655293077725,1.71106736644195,0.0003171478564925,0,0,0 + 217,1.69794400686295,2.8928258965309,0.07939632545295,0,0,0 + 231,1.80774278200725,3.6859142604218,1.55457130346238,0,0.3818897637489,0.3818897637489 + 246,1.57881364816734,3.74526071710995,3.97601356048602,2.480314960431,0.281423009601227,0.663312773350127 + 259,0.947537773327332,3.61840157451295,6.5446506556431,4.846894137844,0.315325021846977,0.978637795197104 + 288,0,2.30570122466198,7.00896762848425,5.4133858263375,1.60453886688794,2.58317666208504'), + header = TRUE + ), + ambient_2005 = utils::read.csv( + textConnection(' + DOY,Leaf_Mg_per_ha,Stem_Mg_per_ha,Rep_Mg_per_ha,Seed_Mg_per_ha,Litter_Mg_per_ha,CumLitter_Mg_per_ha + 172,0.222271875,0.188803125,0,0,0,0 + 186,0.8460375,0.85220625,0,0,0,0 + 
200,1.184465625,1.61896875,0,0,0,0 + 214,2.218059375,4.043615625,0.29925,0,0.041475,0.06654375 + 228,2.147446875,4.477725,2.304553125,0,0.153628125,0.18230625 + 242,1.51948125,3.8920875,5.532778125,3.022490625,0.1157625,0.335934375 + 256,0.06575625,2.89905,5.371078125,3.998203125,0.5310375,0.866971875 + 270,0,2.1756,6.372253125,4.965646875,0.281465625,1.1484375'), + header = TRUE + ), + ambient_2002_std = utils::read.csv( + textConnection(' + DOY,Leaf_Mg_per_ha,Stem_Mg_per_ha,Rep_Mg_per_ha,Seed_Mg_per_ha,Litter_Mg_per_ha,CumLitter_Mg_per_ha + 179,0.040815550055146,0.0170797372473185,0,0,0,0 + 189,0.163863273875505,0.138449024803329,0,0,0,0 + 203,0.133774433506304,0.18371075937777,0.000549316200956573,0,0,0 + 217,0.228326657644473,0.448744065190288,0.0309899984999309,0,0,0 + 231,0.202475421469591,0.453447470747275,0.218402543503905,0,0.0456182274998721,0.0456182274998721 + 246,0.0754751653831798,0.275321356095752,0.573548828062959,0.36605836251352,0.0724761221542088,0.0676627659537074 + 259,0.344550032518756,0.151045377683595,0.743440701063986,0.514421560164731,0.0495275235053204,0.0320523860205109 + 288,0,0.148389260861146,0.141828616948454,0.115260823492553,0.275546039297454,0.267829516105734'), + header = TRUE + ), + ambient_2005_std = utils::read.csv( + textConnection(' + DOY,Leaf_Mg_per_ha,Stem_Mg_per_ha,Rep_Mg_per_ha,Seed_Mg_per_ha,Litter_Mg_per_ha,CumLitter_Mg_per_ha + 172,0.0328965891267146,0.0143181358308921,0,0,0,0 + 186,0.146798299121618,0.19883006141002,0,0,0,0 + 200,0.338074287830052,0.605286252689843,0,0,0,0 + 214,0.152175913412888,0.559874052059856,0.16427519712551,0,0.0496298580745502,0.0637084579721366 + 228,0.119077588733867,0.30674464428393,0.434148073671929,0,0.0171152969203539,0.056246865537668 + 242,0.512808698911872,0.379108485359897,0.588476975732098,0.341714781741362,0.0100661134816274,0.073342888722574 + 256,0.0616862433382638,0.220823980737487,0.520044379820051,0.398956753155868,0.179630286326179,0.214176625850131 + 
270,0,0.243254729045758,0.633090857876002,0.507224090206239,0.0520489826201903,0.246267463689126'), + header = TRUE + ) +) diff --git a/man/bounds_table.Rd b/man/bounds_table.Rd new file mode 100644 index 0000000..45cd68a --- /dev/null +++ b/man/bounds_table.Rd @@ -0,0 +1,122 @@ +\name{bounds_table} + +\alias{bounds_table} + +\title{Create a table of lower and upper bounds} + +\description{ + During an optimization, it is often necessary to provide lower and upper + bounds for the parameters that are being varied. Typically they are specified + as numeric vectors, which often leads to confusing code, where the writer and + reader must remember which value corresponds to each argument; for example, + "the third element of the lower bound vector is for alphaLeaf." + + The purpose of \code{bounds_table} is to make the process of specifying bounds + simpler and easier to follow. It is expected that this function will be called + after \code{\link{objective_function}}. +} + +\usage{ + bounds_table( + independent_args, + bounds_list, + initial_ind_arg_values = NULL + ) +} + +\arguments{ + \item{independent_args}{ + The same value that was passed to \code{\link{objective_function}}. + } + + \item{bounds_list}{ + A list of named elements, where each element is a numeric vector of length + 2. The names should correspond to the independent arguments, and the values + should indicate lower and upper bounds for the corresponding parameter (in + any order). Any "extra" bounds (that is, bounds that do not correspond to + any independent argument) will be ignored. + } + + \item{initial_ind_arg_values}{ + A numeric vector of initial values for each of the independent arguments, + supplied in the same order as in \code{independent_args}. + } +} + +\details{ + The main purpose of this function is to create vectors of lower and upper + bounds, which are returned as the columns of a data frame. 
For each + independent argument in \code{independent_args}, the bounds are supplied via + the \code{bounds_list} input. The syntax is designed so the code calling this + function is easy for a human to parse. (See example below.) + + It is also (optionally) possible to provide an initial guess for each + independent argument via the \code{initial_ind_arg_values} argument. When + provided, these will be checked to make sure they do not lie outside the + bounds; an error will be thrown if any do lie outside the bounds. A warning + will also be thrown if any initial guesses lie on the bounds, since this can + be problematic for some optimizers, such as \code{\link[dfoptim]{nmkb}}. + + Some optimizers, such as \code{\link[DEoptim]{DEoptim}}, do not require an + initial guess; in this case, there is no strong need to pass an initial guess + to \code{bounds_table}. +} + +\value{ + A data frame with three or four columns: \code{arg_name}, \code{lower}, + \code{upper}, and (optionally) \code{initial_value}. + + The \code{lower} and \code{upper} columns are the lower and upper bounds, + determined from \code{bounds_list}. The \code{arg_name} column is the argument + name, and the rows of the table are ordered as in \code{independent_args}. + The \code{initial_value} column contains initial values, if they were + provided via the \code{initial_ind_arg_values} argument. +} + +\examples{ +# Make a list of independent arguments; the values are not used for anything +independent_args <- list( + alphaLeaf = 0, + alphaRoot = 0, + alphaStem = 0, + betaLeaf = 0, + betaRoot = 0, + betaStem = 0 +) + +# Specify bounds and initial guess for each. Note that: +# +# - The bounds will be reordered to follow the same order as the +# `independent_args`, but the initial guess is assumed to already follow the +# same order as the `independent_args`. +# +# - The bounds for the two extra parameters are ignored when forming the table. 
+# +# - The lower and upper bounds can be supplied as (upper, lower) +# or (lower, upper) pairs. +# +b_ll <- -50 # Lower limit for beta parameters +a_ul <- 50 # Upper limit for alpha parameters + +bounds <- bounds_table( + independent_args, + list( + betaStem = c(0, b_ll), + betaRoot = c(0, b_ll), + betaLeaf = c(0, b_ll), + alphaStem = c(0, a_ul), + alphaRoot = c(0, a_ul), + alphaLeaf = c(0, a_ul), + extraPar1 = c(0, 5), + extraPar2 = c(0, 6) + ), + c(1, 1, 1, -1, -1, -1) +) + +print(bounds) + +# Now the properly-ordered lower and upper limits can be accessed as follows: +bounds$lower + +bounds$lower +} diff --git a/man/objective_function.Rd b/man/objective_function.Rd new file mode 100644 index 0000000..2cb1e5a --- /dev/null +++ b/man/objective_function.Rd @@ -0,0 +1,702 @@ +\name{objective_function} + +\alias{objective_function} + +\title{Generate an objective function for BioCro model validation} + +\description{ + Given a base model definition, drivers to run the model, observed values of + model outputs, and the names of model arguments to vary, + \code{objective_function} creates an objective function that can be passed to + a minimization algorithm in order to find optimal parameter values that + produce the best agreement between the model and the observed data. + + The objective function itself is based on a weighted least-squares error + metric, with optional user-defined penalty terms, and an optional + regularization penalty term. + + It is possible to define a multi-year or multi-location objective function by + pairing particular sets of drivers with corresponding sets of observed model + outputs. + + It is also possible to include "dependent" model arguments, whose values are + determined from the "independent" model arguments that are varied during the + parameterization procedure. + + For a detailed example of using \code{objective_function}, see the + "Parameterizing Soybean-BioCro" vignette/article. 
+} + +\usage{ + objective_function( + base_model_definition, + data_driver_pairs, + independent_args, + quantity_weights, + data_definitions = list(), + normalization_method = 'mean_max', + normalization_param = NULL, + stdev_weight_method = 'equal', + stdev_weight_param = NULL, + regularization_method = 'none', + dependent_arg_function = NULL, + post_process_function = NULL, + extra_penalty_function = NULL, + verbose_startup = TRUE + ) +} + +\arguments{ + \item{base_model_definition}{ + A list meeting the requirements for BioCro + \code{\link[BioCro]{crop_model_definitions}}. + } + + \item{data_driver_pairs}{ + A list of named elements, where each element is a "data-driver pair." A + data-driver pair is a list with three required elements: \code{data}, + \code{drivers}, and \code{weight}. Optionally, it may also have a + \code{data_stdev} element. + + The \code{data} element must be a data frame with one column named + \code{time}, whose values follow BioCro's definition of + \code{\link[BioCro]{time}}; the other columns should represent observed + values of model outputs. Any \code{NA} values in \code{data} will + be ignored when calculating the error metric, but all non-\code{NA} values + of all columns (except \code{time}) will be compared to the model output. + + The \code{drivers} element must be a data frame that can be passed to + \code{\link[BioCro]{run_biocro}} as its \code{drivers} input argument. + + The \code{weight} element must be a single numeric value indicating a weight + to be used when calculating the error metric. + + The optional \code{data_stdev} element must be a data frame with the same + column names as \code{data}, and the same time values; its other entries + should represent the standard deviation associated with each entry in + \code{data}. If \code{data_stdev} is not supplied, all standard deviations + will be set to 1. + } + + \item{independent_args}{ + A list of named numeric values. 
The names will determine the independent + arguments to be varied during their optimization, and the values specify + "initial" or "test" values of each argument that will be used internally to + check that the objective function is properly defined and can be evaluated. + } + + \item{quantity_weights}{ + A list of named numeric values, where the name of each element is one of the + model outputs to be compared against the observed data, and the value is the + weight for that output. Each weight can be a single number, or a pair of + numbers. When the weight is a pair, the first number is the weight that will + be used for underestimates (when the modeled value is smaller than the + observed value), and the second is the weight for overestimates. + } + + \item{data_definitions}{ + A list of named string values, where the name of each element is one of the + data columns in the \code{data_driver_pairs} and the value is that column's + corresponding name in the model output. For all other data columns in the + \code{data_driver_pairs}, it is assumed that the data column name matches a + column in the model output. + } + + \item{normalization_method}{ + A string indicating the normalization method to be used when calculating the + error metric; see below for more details. + } + + \item{normalization_param}{ + An (optional) parameter value used by some normalization methods. When + \code{normalization_param} is \code{NULL}, a default value will be used, + which depends on the particular normalization method. Otherwise, the + user-specified value will be used. See details below. + } + + \item{stdev_weight_method}{ + A string indicating the method to be used when calculating the + variance-based weights used in the error metric; see below for more details. + } + + \item{stdev_weight_param}{ + An (optional) parameter value used by some normalization methods. 
When + \code{stdev_weight_param} is \code{NULL}, a default value will be used, + which depends on the particular normalization method. Otherwise, the + user-specified value will be used. See details below. + } + + \item{regularization_method}{ + A string indicating the regularization method to be used when calculating + the regularization penalty term; see below for more details. + } + + \item{dependent_arg_function}{ + A function whose input argument is a named list of independent argument + values, and which returns a named list of dependent argument values. If the + \code{dependent_arg_function} is \code{NULL}, no dependent argument values + will be calculated. + } + + \item{post_process_function}{ + A function whose input argument is a data frame representing the output from + \code{\link[BioCro]{run_biocro}}, and which returns a data frame, typically + based on the input but with one or more new columns. If the + \code{post_process_function} is \code{NULL}, no post-processing will be + applied to the raw simulation output. + } + + \item{extra_penalty_function}{ + A function whose input argument is a data frame representing the output from + \code{\link[BioCro]{run_biocro}}, and which returns a numeric penalty to be + added to the least-squares term when calculating the error metric. If the + \code{extra_penalty_function} is \code{NULL}, no extra penalties will be + added. + } + + \item{verbose_startup}{ + A logical (\code{TRUE} or \code{FALSE}) value indicating whether to print + additional information to the R terminal when creating the objective + function. + } +} + +\details{ + \strong{Overview} + + When parameterizing a BioCro model, the general idea is to vary a subset of + the model's parameters to achieve the best agreement with a set of observed + data. The degree of agreement is expressed as an "error metric," which + includes terms derived from the agreement between the modeled and observed + values, as well as (optional) penalty terms. 
A function that calculates an + error metric when given a set of parameter values is called an "objective + function." Defining an objective function suitable for BioCro parameterization + can be complicated, but the \code{objective_function} function helps to + simplify the process of creating such a function. It is designed to + accommodate the following scenarios, which often occur in the context of + BioCro model parameterization: + + \itemize{ + \item \strong{Multi-year or multi-location data:} Often the model needs to + be run several times with different drivers corresponding to multiple + years or locations, and then the results from each individual + simulation must be compared to associated sets of observed data. Here, + this is handled through the \code{data_driver_pairs}, which allows the + user to specify which drivers and data sets should be compared to each + other. + + \item \strong{Complicated normalization:} Care must be taken to ensure that + certain years or output variables are not over-valued in the error + metric; for example, one year may have more observations of leaf mass + than another year, or the stem mass may be much larger than the leaf + mass. Here, this is handled through pre-set normalization approaches, + which can be specified through the \code{normalization_method} input. + See below for more information. + + \item \strong{Extra penalties:} Sometimes an optimizer chooses parameters + that produce close agreement with the observed data, but are + nevertheless not biologically resonable. For example, it may produce + a sharp peak at a high leaf mass in between two measured points, when + in reality, leaf mass should be nearly constant between them. In this + case, it may be necessarily to add an extra penalty to the objective + function that prevents the optimizer from choosing such values. Here, + this is handled through the \code{extra_penalty_function} input. 
+ + \item \strong{Flexible weights:} Often a user would like to specify a weight + for each variable being considered in the error metric, either to + represent an uncertainty or to emphasize agreement with one output + at the expense of another. For example, the seed mass may need a high + weight to prioritize accurate yield predictions. Further, these + weights may need to differ for underestimates as compared to + overestimates; for example, measured root mass is often lower than the + true root mass, so a user may wish to penalize underestimates of root + mass more severely than overestimates. Here, these situations are + handled through the \code{quantity_weights} and + \code{stdev_weight_method} inputs. + + \item \strong{Dependent parameters:} Sometimes one model parameter must be + determined from one or more other parameters; for example, a user may + wish to require that the leaf and stem maintenance respiration + coefficients are identical. Here, this is handled through the + \code{dependent_arg_function}, which allows the user to specify how + the values of such "dependent" parameters should be determined from + the values of the "independent" parameters. + + \item \strong{Name mismatches:} Often a particular variable has different + names in the data set and simulation output. Here, this is handled + through the \code{data_definitions}, which allows the user to specify + which columns in the model output should be compared to particular + columns in the observed data. + + \item \strong{Incomplete outputs:} Sometimes a model may not produce outputs + that are directly comparable to the observed values; for example, a + model may calculate seed and shell mass, while a data set includes pod + mass, which is the sum of seed and shell. 
Here, this is handled by an + optional \code{post_process_function}, which allows users to specify + additional operations to perform on the model output; in this example, + it would be used to calculate the pod mass so it can be compared to + the observations. + } + + \strong{Error metric calculations} + + As mentioned above, the overall error metric \eqn{E} is calculated as + + \deqn{E = E_{agreement} + P_{user} + P_{regularization},} + where \eqn{E_{agreement}} is determined by the agreement between the model and + observations, \eqn{P_{user}} is an optional user-defined penalty, and + \eqn{P_{regularization}} is an optional regularization penalty. These terms + are explained in more detail below: + + \itemize{ + \item \strong{Agreement term:} The agreement term \eqn{E_{agreement}} is + calculated using a least-squares approach. In other words, + + \deqn{E_{agreement} = \sum_i \left(Y_{obs}^i - Y_{mod}^i \right)^2% + \cdot \frac{w_i^{quantity} w_i^{data} w_i^{stdev}}{N_i},} + where the sum runs over all \eqn{n} observations; \eqn{Y_{obs}^i} and + \eqn{Y_{mod}^i} are observed and modeled values of variable \eqn{Y_i}; + \eqn{w_i^{quantity}}, \eqn{w_i^{data}}, and \eqn{w_i^{stdev}} are + weight factors that depend on the name of \eqn{Y_i}, the data set that + includes the \eqn{i^{th}} observation, and the standard deviation + associated with \eqn{Y_{obs}^i}, respectively; and \eqn{N_i} is a + normalization factor. + + Each value of \eqn{Y_{obs}^i} is specified at a particular time + \eqn{t_i}. The corresponding modeled value, \eqn{Y_{mod}^i}, is found + by retrieving the value of the \eqn{Y_i} variable at the closest time + to \eqn{t_i} that is included in the model output. It is assumed that + the model always outputs the same sequence of time values each time it + is run with a particular set of drivers, regardless of the input + argument values. 
+ + The quantity-based weight factors \eqn{w_i^{quantity}} are directly + specified by the user via the \code{quantity_weights} input. For + example, if \code{quantity_weights} has an element named \code{Leaf} + that is equal to 0.5, then \eqn{w_i} will be equal to 0.5 whenever + \eqn{Y_i} represents a leaf mass value, regardless of which set of + drivers or time point corresponds to \eqn{Y_i}. The weights can also + be supplied as \eqn{(w_{under}, w_{over})} pairs instead of single + values; in this case, the value of \eqn{w_i} depends on whether the + model makes an underprediction or an overprediction: + \eqn{w_i = w_{under}} when \eqn{Y_{mod}^i < Y_{obs}^i} and + \eqn{w_i = w_{over}} otherwise. + + The data-set-based weight factors \eqn{w_i^{data}} are directly + specified by the user via the \code{weight} element of each + data-driver pair. For example, if the second element of + \code{data_driver_pairs} has a \code{weight} of 2.0, then + \eqn{w_i^{data}} will be equal to 2.0 for all observations from the + corresponding data set. + + The standard-deviation-based weight factors \eqn{w_i^{stdev}} are + determined by the choice of \code{stdev_weight_method}; the available + methods are discussed below. + + The normalization factors \eqn{N_i} are determined by the choice of + \code{normalization_method}; the available methods are discussed + below. + + There are a few special cases where \eqn{E_{agreement}} is set to a + very high value (\code{BioCroValidation:::FAILURE_VALUE}). This is + done when a simulation fails to run, when the \eqn{E_{agreement}} term + would otherwise evaluate to \code{NA}, or when the \eqn{E_{agreement}} + term would otherwise evaluate to an infinite value. + + \item \strong{User-defined penalty term:} The user-defined penalty term + \eqn{P_{user}} is calculated by applying a function \eqn{f_{user}} to + the full simulation output from each set of drivers. 
In other words, + + \deqn{P_{user} = \sum_k f_{user} \left( M_k \right),} + where the sum runs over all \eqn{k} sets of drivers and \eqn{M_k} is + the model output when it is run with the \eqn{k^{th}} set of drivers. + + The function \eqn{f_{user}} must accept a single data frame as an + input and return a single numeric value as its output, but has no + other requirements. It is specified via the + \code{extra_penalty_function} argument. When + \code{extra_penalty_function} is \code{NULL}, \eqn{P_{user}} is zero. + + \item \strong{Regularization penalty term:} The regularization penalty term + \eqn{P_{regularization}} is calculated from the values of the + arguments being varied during the optimization by applying a function + \eqn{R}. In other words, + + \deqn{P_{regularization} = R \left( X \right),} + where \eqn{X} represents the model argument values. + + The function \eqn{R} is determined by the choice of + \code{regularization_method}; the available methods are discussed + below. + } + + \strong{Standard-deviation-based weight methods} + + The following methods are available for determining weight factors from values + of the standard deviation (\eqn{\sigma}), which can be (optionally) supplied + via the \code{data_stdev} elements of the \code{data_driver_pairs}: + + \itemize{ + \item \code{'equal'}: For this method, \eqn{w_i^{stdev}} is always set to + 1. In other words, all variances are treated as being equal, + regardless of any user-supplied values. This is usually the best + choice when values of \eqn{\sigma} are unavailable or cannot be + estimated. + + \item \code{'logarithm'}: For this method, \eqn{w_i^{stdev}} is calculated + as + + \deqn{w_i^{stdev} =% + ln \left( \frac{1}{\sigma_i + \epsilon} \right),} + where \eqn{ln} denotes a logarithm with base \eqn{e} and + \eqn{\epsilon} is a small number included to prevent numerical errors + that would otherwise occur when \eqn{\sigma = 0}. This method was + used in the original Soybean-BioCro paper. 
+ + The value of \eqn{\epsilon} is specified by the + \code{stdev_weight_param} input argument, which defaults to + \code{1e-5} when \code{stdev_weight_param} is \code{NULL} when using + this method. With the default value of \eqn{\epsilon}, + \eqn{w_i^{stdev} \approx 11.512} when \eqn{\sigma = 0}. + + Note: this method should be used with caution, because + \eqn{w_i^{stdev}} is zero for \eqn{\sigma_i = 1 - \epsilon}, and it + becomes negative for larger values of \eqn{\sigma_i}. + + \item \code{'inverse'}: For this method, \eqn{w_i^{stdev}} is calculated as + + \deqn{w_i^{stdev} = \frac{1}{\sigma_i^2 + \epsilon},} + where \eqn{\epsilon} is a small number included to prevent numerical + errors that would otherwise occur when \eqn{\sigma_i = 0}. + + The value of \eqn{\epsilon} is specified by the + \code{stdev_weight_param} input argument, which defaults to + \code{1e-1} when \code{stdev_weight_param} is \code{NULL} when using + this method. With the default value of \eqn{\epsilon}, + \eqn{w_i^{stdev} = 10} when \eqn{\sigma_i = 0}. + } + + If any values of \eqn{w_i^{stdev}} are undefined, negative, or infinite, an + error message will occur (see the "Input checks" section below). + + \strong{Normalization methods} + + The following normalization methods are available: + + \itemize{ + \item \code{'equal'}: For this method, \eqn{N_i} is always set to 1. In + other words, no normalization is performed. + + \item \code{'mean'}: For this method, when \eqn{Y_i} is named \code{vtype} + and the observation is from a set called \code{vdata}, then + + \deqn{N_i = n_{vtype}^{vdata} \cdot n_{data},} + where \eqn{n_{vtype}^{vdata}} is the number of observations of type + \code{vtype} that are included in \code{vdata} and \eqn{n_{data}} is + the total number of data-driver pairs. In this case, the error term + \eqn{E_{agreement}} becomes a mean error across the full set of + drivers, hence the name for this method. 
This approach avoids + over-representing drivers with larger numbers of associated + observations when determining \eqn{E_{agreement}}. It also preserves + the overall magnitude of \eqn{E_{agreement}} when data-driver pairs + are added. + + \item \code{'max'}: For this method, when \eqn{Y_i} is named \code{vtype} + and the observation is from a set called \code{vdata}, then + + \deqn{N_i = \left( max_{vtype}^{vdata} \right)^2,} + where \eqn{max_{vtype}^{vdata}} is the maximum observed value of + \code{vtype} across \code{vdata}. In this case, the observed and + modeled values that appear in the equation for \eqn{E_{agreement}} are + essentially normalized by their maximum value, hence the name for this + method. This approach avoids over-representing variable types with + larger magnitude when determining \eqn{E_{agreement}}. + + \item \code{'obs'}: For this method, + + \deqn{N_i = \left( Y_{obs}^i \right)^2 + \epsilon,} + where \eqn{\epsilon} is a small number included to prevent numerical + errors that would otherwise occur when \eqn{Y_{obs}^i = 0}. In this + case, the equation for \eqn{E_{agreement}} essentially uses relative + differences rather than absolute differences, which is achieved by + normalizing by the observed values, hence the name. This approach + avoids over-representing time points where a particular quantity takes + its largest values when determining \eqn{E_{agreement}}. + + The value of \eqn{\epsilon} is specified by the + \code{normalization_param} input argument, which defaults to + \code{1e-1} when \code{normalization_param} is \code{NULL} when using + this method. With the default value of \eqn{\epsilon}, \eqn{N_i = 10} + when \eqn{Y_{obs}^i = 0}. 
+ + \item \code{'mean_max'}: For this method, the "mean" and "max" methods are + combined so that + + \deqn{N_i = n_{vtype}^{vdata}% + \cdot n_{data} \cdot \left( max_{vtype}^{vdata} \right)^2.} + This approach avoids over-representing drivers with larger numbers of + associated observations, and variable types with larger magnitudes. + This method is used for parameterizing Soybean-BioCro. + + \item \code{'mean_obs'}: For this method, the "mean" and "obs" methods are + combined so that + + \deqn{N_i = n_{vtype}^{vdata} \cdot n_{data}% + \cdot \left( \left( Y_{obs}^i \right)^2 + \epsilon \right).} + This approach avoids over-representing drivers with larger numbers of + associated observations, and time points with larger observed values. + + The value of \eqn{\epsilon} is specified by the + \code{normalization_param} input argument, which defaults to + \code{1e-1} when \code{normalization_param} is \code{NULL} when using + this method. With the default value of \eqn{\epsilon}, + \eqn{N_i = 10 \cdot n_{vtype}^{vdata} \cdot n_{data}} when + \eqn{Y_{obs}^i = 0}. + } + + In most situations, it is recommended to use either \code{'mean_max'} or + \code{'mean_obs'} depending on user preference or performance. + + \strong{Regularization methods} + + The following regularization methods are available: + + \itemize{ + \item \code{'none'}: For this method, \eqn{P_{regularization}} is always set + to 0. In other words, no regularization is performed. + + \item \code{'L1'} or \code{'lasso'}: For this method, + \eqn{P_{regularization}} is given by the sum of the absolute values of + each independent argument, multiplied by a "regularization parameter" + \eqn{\lambda} that sets the overall weight of the penalty: + + \deqn{P_{regularization} = \lambda \sum_j | X_j |,} + where the sum runs over all \eqn{j} independent arguments, and + \eqn{X_j} is the value of the \eqn{j^{th}} argument. See the "Value" + section below for details of how to specify \eqn{\lambda}. 
+ + \item \code{'L2'} or \code{'ridge'}: For this method, + \eqn{P_{regularization}} is given by the sum of the squared values of + each independent argument, multiplied by a "regularization parameter" + \eqn{\lambda} that sets the overall weight of the penalty: + + \deqn{P_{regularization} = \lambda \sum_j X_j^2,} + where the sum runs over all \eqn{j} independent arguments, and + \eqn{X_j} is the value of the \eqn{j^{th}} argument. See the "Value" + section below for details of how to specify \eqn{\lambda}. + } + + \strong{Input checks} + + Several checks are made to ensure that the objective function is properly + defined. These checks include, but are not limited to, the following: + + \itemize{ + \item Ensuring that each set of drivers in \code{data_driver_pairs} defines + a valid dynamical system along with the \code{base_model_definition}. + This is accomplished using + \code{\link[BioCro]{validate_dynamical_system_inputs}}. + + \item Ensuring that the model output corresponding to each set of drivers + spans the times at which the observations were made. + + \item Ensuring that each variable type in the data elements of + \code{data_driver_pairs} matches a corresponding column in the model + output, when accounting for the \code{data_definitions} and + \code{post_process_function}. + + \item Ensuring that each independent and dependent argument name is either + a parameter or initial value of the model. Internally, argument names + are passed to \code{\link[BioCro]{partial_run_biocro}} via its + \code{arg_names} input. Note that argument names passed to + \code{partial_run_biocro} can technically include drivers, but it is + unlikely that the value of a driver would be varied during an + optimization, so the argument names are not allowed to include + columns in the drivers. + + \item Ensuring that the optional \code{dependent_arg_function}, + \code{post_process_function}, and \code{extra_penalty_function} + functions can be run without causing errors. 
+ + \item Ensuring that certain values are finite (such as \eqn{Y_{obs}}, + \eqn{\sigma_i}, \eqn{w_i^{stdev}}, and \eqn{N_i}), and that certain + values are not negative (such as \eqn{\sigma_i}, \eqn{w_i^{stdev}}, + and \eqn{N_i}). + } + + If any issues are detected, an informative error message will be sent. Note + that several of these checks require running the model with each set of + drivers. For these checks, the argument values specified by + \code{independent_args} will be used, so they should be valid or otherwise + reasonable values. + + If an error message occurs when \code{verbose_startup} was set to + \code{FALSE}, it is recommended to call this function again with + \code{verbose_startup} set to \code{TRUE}, since the additional output can be + helpful for troubleshooting. +} + +\value{ + A function \code{obj_fun} with signature + \code{obj_fun(x, lambda = 0, return_terms = FALSE, debug_mode = FALSE)}. + + Here, \code{x} is a numeric vector of values of the independent arguments (in + the same order as in \code{independent_arg_names}), and \code{lambda} is the + value of the regularization parameter. + + The \code{return_terms} argument determines the return value of + \code{obj_fun}. When \code{return_terms} is \code{FALSE}, \code{obj_fun} + returns values of the error metric \eqn{E}. When \code{return_terms} is + \code{TRUE}, \code{obj_fun} returns a list including each individual term of + the total error metric. + + During optimization, \code{return_terms} should always be \code{FALSE}. + Setting it to \code{TRUE} can be useful for troubleshooting, or for + diagnostics such as checking the quality of fit for each of the data-driver + pairs. + + The \code{debug_mode} argument determines whether \code{obj_fun} is running in + debug mode. In debug mode, each time \code{obj_fun} is called, it will print + the values of \code{x} and the error metric to the R terminal. 
This can be + useful when troubleshooting a problem with an optimization, since it provides + a record of any problematic parameter combinations. When setting + \code{debug_mode} to \code{TRUE}, also consider using \code{\link[base]{sink}} + to write the outputs to a file instead of the R terminal. In that case, there + will still be a record even if R crashes. +} + +\examples{ +# Example: Create an objective function that enables optimization of the +# `alphaLeaf`, `betaLeaf`, and `alphaStem` parameters of the Soybean-BioCro +# model. Additional details are provided below. Important note: This example is +# designed to highlight key features of `objective_function`, and is not +# necessarily realistic. + +if (require(BioCro)) { + # We will use Soybean-BioCro as the base model definition, but we will change + # the ODE solver to use the Euler method so the model runs faster. + base_model_definition <- BioCro::soybean + base_model_definition$ode_solver <- BioCro::default_ode_solvers[['homemade_euler']] + + # We will use the `soyface_biomass` data set (included with the + # `BioCroValidation` package) for the observed values; this set includes + # observations of leaf, stem, and pod biomass from two years, which are stored + # in two data tables; it also includes the standard deviations of the measured + # biomasses, which are included in two separate tables. However, these data + # tables each have a `DOY` column rather than a `time` column, so we need to + # alter them. The tables also include other columns we do not wish to use in + # this example. So, we will define a short helper function that can be used to + # pre-process each table. 
+ process_table <- function(x) { + within(x, { + # Define new `time` column + time = (DOY - 1) * 24.0 + + # Remove unneeded columns + DOY = NULL + Seed_Mg_per_ha = NULL + Litter_Mg_per_ha = NULL + CumLitter_Mg_per_ha = NULL + }) + } + + # The data-driver pairs can now be created by associating each data set with + # its corresponding weather data. Here we will weight the 2005 data twice as + # heavily as the 2002 data. + data_driver_pairs <- list( + ambient_2002 = list( + data = process_table(soyface_biomass[['ambient_2002']]), + data_stdev = process_table(soyface_biomass[['ambient_2002_std']]), + drivers = BioCro::soybean_weather[['2002']], + weight = 1 + ), + ambient_2005 = list( + data = process_table(soyface_biomass[['ambient_2005']]), + data_stdev = process_table(soyface_biomass[['ambient_2005_std']]), + drivers = BioCro::soybean_weather[['2005']], + weight = 2 + ) + ) + + # In the data, the leaf biomass is in the `Leaf_Mg_per_ha` column, but in the + # simulation output, it is in the `Leaf` column. Similar naming differences + # occur for the stem and pod mass. To address this, we can provide a data + # definition list. + data_definitions <- list( + Leaf_Mg_per_ha = 'Leaf', + Stem_Mg_per_ha = 'Stem', + Rep_Mg_per_ha = 'Pod' + ) + + # The data contains values of pod mass, but the model does not calculate pod + # mass; instead, it returns separate values of `Grain` (seed) and `Shell` + # mass, two components which form the pod together. To address this, we can + # provide a post-processing function to calculate the pod mass. + post_process_function <- function(sim_res) { + within(sim_res, {Pod = Grain + Shell}) + } + + # Here we wish to independently vary the `alphaLeaf` and `betaLeaf` + # parameters. We also wish to vary `alphaStem`, but require that its value is + # always equal to `alphaLeaf`. To do this, we can specify independent + # arguments, and a function for determining dependent argument values. 
We will + # choose "test" values of the independent arguments as their values in the + # original Soybean-BioCro model. + independent_args <- BioCro::soybean[['parameters']][c('alphaLeaf', 'betaLeaf')] + + dependent_arg_function <- function(ind_args) { + list(alphaStem = ind_args[['alphaLeaf']]) + } + + # When determining the error metric value, we wish to weight the pod highest + # to ensure a close fit to the observed pod masses. We also wish to decrease + # the penalty for overestimates of the stem mass, since we believe our + # observations to be underestimates. + quantity_weights <- list( + Leaf = 0.5, + Stem = c(0.5, 0.25), + Pod = 1 + ) + + # We want to prevent the optimizer from choosing parameters that produce + # unreasonably high leaf mass + extra_penalty_function <- function(sim_res) { + max_leaf <- max(sim_res[['Leaf']], na.rm = TRUE) + + if (is.na(max_leaf) || max_leaf > 4) { + 1e5 # Add a steep penalty + } else { + 0 + } + } + + # Now we can finally create the objective function + obj_fun <- objective_function( + base_model_definition, + data_driver_pairs, + independent_args, + quantity_weights, + data_definitions = data_definitions, + stdev_weight_method = 'logarithm', + dependent_arg_function = dependent_arg_function, + post_process_function = post_process_function, + extra_penalty_function = extra_penalty_function + ) + + # This function could now be passed to an optimizer; here we will simply + # evaluate it for two sets of parameter values. + + # Try doubling each parameter value; in this case, the value of the + # objective function increases, indicating a lower degree of agreement between + # the model and the observed data. Here we will call `obj_fun` in debug mode, + # which will automatically print the value of the error metric. 
+ + cat('\nError metric calculated by doubling the initial argument values:\n') + error_metric <- obj_fun(2 * as.numeric(independent_args), debug_mode = TRUE) + + # We can also see the values of each term that makes up the error metric; + # again, we will call `obj_fun` in debug mode for automatic printing. + cat('\nError metric terms calculated by doubling the initial argument values:\n') + error_terms <- + obj_fun(2 * as.numeric(independent_args), return_terms = TRUE, debug_mode = TRUE) +} +} diff --git a/man/soyface_biomass.Rd b/man/soyface_biomass.Rd new file mode 100644 index 0000000..e8a0dd7 --- /dev/null +++ b/man/soyface_biomass.Rd @@ -0,0 +1,63 @@ +\name{soyface_biomass} + +\docType{data} + +\alias{soyface_biomass} + +\title{SoyFACE Soybean Biomass Data} + +\description{ + Two years of soybean biomass data collected at the SoyFACE facility in + Champaign, IL during the 2002 and 2005 field seasons. +} + +\usage{soyface_biomass} + +\format{ + A list of four named elements, where each element is a data frame with the + following columns: + \itemize{ + \item \code{DOY}: The day of year + \item \code{Leaf_Mg_per_ha}: Leaf biomass per area expressed in Mg / ha + \item \code{Stem_Mg_per_ha}: Stem biomass per area expressed in Mg / ha + \item \code{Rep_Mg_per_ha}: Pod biomass per area expressed in Mg / ha + \item \code{Seed_Mg_per_ha}: Seed biomass per area expressed in Mg / ha + \item \code{Litter_Mg_per_ha}: Mass of leaf litter accumulated between + harvests, expressed in Mg / ha + \item \code{CumLitter_Mg_per_ha}: Cumulative leaf litter biomass per area + expressed in Mg / ha + } + + The elements named \code{ambient_2002} and \code{ambient_2005} represent mean + biomass values measured from plants grown in ambient CO2 conditions during + 2002 and 2005, respectively. 
+ + The elements named \code{ambient_2002_std} and \code{ambient_2005_std} + represent the standard deviation of biomass values measured from plants grown + in ambient CO2 conditions during 2002 and 2005, respectively. +} + +\source{ + The leaf, stem, and pod data was obtained from several files from the + \href{https://github.com/cropsinsilico/soybean-biocro}{Soybean-BioCro GitHub repository}: + \itemize{ + \item \code{Data/SoyFACE_data/2002_ambient_biomass.csv} + \item \code{Data/SoyFACE_data/2005_ambient_biomass.csv} + \item \code{Data/SoyFACE_data/2002_ambient_biomass_std.csv} + \item \code{Data/SoyFACE_data/2005_ambient_biomass_std.csv} + } + + See that repository for more information. + + The leaf litter accumulated between harvests was obtained from the original + sources. The cumulative leaf litter was calculated from the amount accumulated + between harvests. The seed mass was estimated as a fraction of the total pod + mass, using proportionality factors determined from unpublished data collected + in Champaign, Illinois during 2021-2022. + + These data tables have not been published previously, but were used to + parameterize Soybean-BioCro as used in He \emph{et al.} 2024 + (\doi{10.1093/insilicoplants/diae009}). +} + +\keyword{datasets} diff --git a/man/update_model.Rd b/man/update_model.Rd new file mode 100644 index 0000000..f3a86ae --- /dev/null +++ b/man/update_model.Rd @@ -0,0 +1,70 @@ +\name{update_model} + +\alias{update_model} + +\title{Update a BioCro model definition} + +\description{ + Following an optimization, it is typically necessary to update the initial + values and/or parameters of a base model definition with new values determined + during the optimization. The \code{update_model} function helps to streamline + this process. It is expected that this function will be called after + \code{\link{objective_function}}. 
+} + +\usage{ + update_model( + base_model_definition, + independent_args, + new_ind_arg_values, + dependent_arg_function = NULL + ) +} + +\arguments{ + \item{base_model_definition}{ + The same value that was passed to \code{\link{objective_function}}. + } + + \item{independent_args}{ + The same value that was passed to \code{\link{objective_function}}. + } + + \item{new_ind_arg_values}{ + A numeric vector representing new values of the independent arguments, + typically determined by an optimizer. + } + + \item{dependent_arg_function}{ + The same value that was passed to \code{\link{objective_function}}. + } +} + +\value{ + A list based on \code{base_model_definition} but with new values of some of + its \code{initial_values} and \code{parameters}, as specified by the elements + of \code{independent_args} and \code{new_ind_arg_values}. +} + +\examples{ +if (require(BioCro)) { + # Update the default Soybean-BioCro model with new values of `Leaf` (an + # initial value) and `alphaLeaf` (a parameter) + base_model <- BioCro::soybean + + new_model <- update_model( + base_model, + list(Leaf = 1, alphaLeaf = 2), # The values here will not be used + c(1000, 2000) # These are the actual new values + ) + + # Compare the two models + cat('\n\nComparing initial Leaf values:\n') + print(base_model$initial_values$Leaf) + print(new_model$initial_values$Leaf) + + cat('\n\nComparing alphaLeaf values:\n') + print(base_model$parameters$alphaLeaf) + print(new_model$parameters$alphaLeaf) +} +} diff --git a/tests/testthat/test-bounds_table.R b/tests/testthat/test-bounds_table.R new file mode 100644 index 0000000..7790528 --- /dev/null +++ b/tests/testthat/test-bounds_table.R @@ -0,0 +1,98 @@ +independent_args <- list(param1 = 0.1, param2 = 0.2, param3 = 0.3) + +bounds_list <- list( + param1 = c(0, 1), + param3 = c(4, 3), + param2 = c(1, 2), + param4 = c(7, 8) +) + +test_that('Bounds tables can be created', { + # No initial values + expect_silent( + bounds_table( + independent_args, + 
bounds_list + ) + ) + + # Initial values + bounds <- expect_silent( + bounds_table( + independent_args, + bounds_list, + c(0.5, 1.5, 3.5) + ) + ) + + expect_equal( + bounds[['arg_name']], + c('param1', 'param2', 'param3') + ) + + expect_equal( + bounds[['lower']], + c(0, 1, 3) + ) + + expect_equal( + bounds[['upper']], + c(1, 2, 4) + ) +}) + +test_that('Values outside the bounds are detected', { + expect_error( + bounds_table( + independent_args, + bounds_list, + c(0.5, 2.5, 3.5) + ), + 'The initial values for the following arguments lie outside the bounds: param2' + ) +}) + +test_that('Values on the bounds are detected', { + expect_warning( + bounds_table( + independent_args, + bounds_list, + c(0.5, 2.0, 3.5) + ), + 'The initial values for the following arguments lie on the bounds, which can be problematic for some optimizers: param2' + ) +}) + +test_that('Missing bounds are detected', { + expect_error( + bounds_table( + independent_args, + 1.0 + ), + '`bounds_list` must be a list of named elements' + ) + + expect_error( + bounds_table( + independent_args, + list(1, 2, 3) + ), + '`bounds_list` must be a list of named elements' + ) + + expect_error( + bounds_table( + independent_args, + bounds_list[1:2] + ), + 'The following elements were included in `independent_args` but not `bounds_list`: param2' + ) + + expect_error( + bounds_table( + independent_args, + within(bounds_list, {param1 = 1.0}) + ), + 'The following elements of `bounds_list` do not have a length of 2: param1' + ) +}) diff --git a/tests/testthat/test-objective_function.R b/tests/testthat/test-objective_function.R new file mode 100644 index 0000000..dcfccf1 --- /dev/null +++ b/tests/testthat/test-objective_function.R @@ -0,0 +1,526 @@ +# Specify key inputs to use for these tests +model <- BioCro::soybean +model$ode_solver <- BioCro::default_ode_solvers[['homemade_euler']] + +process_table <- function(x) { + within(x, { + time = (DOY - 1) * 24.0 + DOY = NULL + Seed_Mg_per_ha = NULL + Litter_Mg_per_ha = 
NULL + CumLitter_Mg_per_ha = NULL + }) +} + +ddps <- list( + ambient_2002 = list( + data = process_table(soyface_biomass[['ambient_2002']]), + data_stdev = process_table(soyface_biomass[['ambient_2002_std']]), + drivers = BioCro::soybean_weather[['2002']], + weight = 1 + ), + ambient_2005 = list( + data = process_table(soyface_biomass[['ambient_2005']]), + drivers = BioCro::soybean_weather[['2005']], + weight = 2 + ) +) + +independent_args <- with(BioCro::soybean[['parameters']], { + list(alphaLeaf = alphaLeaf, betaLeaf = betaLeaf) +}) + +data_definitions <- list( + Leaf_Mg_per_ha = 'Leaf', + Stem_Mg_per_ha = 'Stem', + Rep_Mg_per_ha = 'Pod' +) + +dependent_arg_function <- function(x) { + list(alphaStem = x[['alphaLeaf']]) +} + +post_process_function <- function(x) { + within(x, {Pod = Grain + Shell}) +} + +quantity_weights <- list( + Leaf = 0.5, + Stem = c(0.5, 0.25), + Pod = 1 +) + +verbose_startup <- FALSE + +# Run tests +test_that('Objective functions can be created and behave as expected', { + # Two data-driver pairs, no dependent arguments + obj_fun <- expect_silent( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ) + ) + + expect_silent( + obj_fun(as.numeric(independent_args)) + ) + + # One data-driver pair, no dependent arguments + obj_fun <- expect_silent( + objective_function( + model, + ddps[1], + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ) + ) + + # Two data-driver pairs, with dependent arguments and L2 regularization + obj_fun <- expect_silent( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + data_definitions = data_definitions, + dependent_arg_function = dependent_arg_function, + post_process_function = post_process_function, + 
regularization_method = 'L2', + verbose_startup = verbose_startup + ) + ) + + expect_silent( + obj_fun(as.numeric(independent_args), lambda = 0.5) + ) + + expect_true( + is.list( + obj_fun(as.numeric(independent_args), lambda = 0.5, return_terms = TRUE) + ) + ) +}) + +test_that('Bad definitions are detected', { + expect_error( + objective_function( + model, + within(ddps, {ambient_2005$drivers$temp = NULL}), + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ), + 'The following drivers did not form a valid dynamical system: ambient_2005' + ) +}) + +test_that('Independent and dependent arguments must have names', { + expect_error( + objective_function( + model, + ddps, + as.numeric(independent_args), + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ), + '`independent_args` must have names' + ) + + expect_error( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + dependent_arg_function = function(x) {1.0}, + verbose_startup = verbose_startup + ), + 'The return value of `dependent_arg_function` must have names' + ) + + expect_error( + objective_function( + model, + ddps, + c(independent_args, list(solar = 1000)), + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + dependent_arg_function = function(x) {list(precip = 0.1)}, + verbose_startup = verbose_startup + ), + 'Some independent or dependent argument names refer to columns in the drivers: solar, precip' + ) +}) + +test_that('Bad argument names are detected', { + expect_error( + objective_function( + model, + ddps, + c(independent_args, list(bad_arg_name = 1)), + quantity_weights, + data_definitions = data_definitions, + 
post_process_function = post_process_function, + verbose_startup = verbose_startup + ), + 'Model runners could not be created for the following drivers: +ambient_2002: Error: `bad_arg_name` from `arg_names` is not in the `initial_values`, `parameters`, or `drivers` +ambient_2005: Error: `bad_arg_name` from `arg_names` is not in the `initial_values`, `parameters`, or `drivers`', + fixed = TRUE + ) +}) + +test_that('Model failures are detected', { + expect_error( + objective_function( + within(model, {parameters$lnfun = 1}), + ddps, + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ), + 'The model could not be run with the following drivers: +ambient_2002: Error in as.data.frame(.Call(R_run_biocro, initial_values, parameters, : Caught exception in R_run_biocro: Thrown by the multilayer_canopy_properties module: lnfun != 0 is not yet supported. +ambient_2005: Error in as.data.frame(.Call(R_run_biocro, initial_values, parameters, : Caught exception in R_run_biocro: Thrown by the multilayer_canopy_properties module: lnfun != 0 is not yet supported.', + fixed = TRUE + ) +}) + +test_that('Data-driver pairs must have correct elements', { + expect_error( + objective_function( + model, + list(), + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ), + '`data_driver_pairs` must have at least one element' + ) + + expect_error( + objective_function( + model, + within(ddps, {ambient_2002$data = NULL; ambient_2005$drivers = NULL}), + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ), + 'The following data-driver pairs are missing at least one required element (drivers, data, weight): ambient_2002, ambient_2005', + fixed = TRUE + ) + + 
expect_error( + objective_function( + model, + within(ddps, {ambient_2002$extra_element = 5}), + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ), + 'The following data-driver pairs have unexpected elements: ambient_2002. The allowed elements are: drivers, data, weight, data_stdev.' + ) + + expect_error( + objective_function( + model, + within(ddps, {ambient_2002$data_stdev = 5}), + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ), + 'The following data-driver pairs have a `data_stdev` element that does not match the columns and/or times of their `data` element: ambient_2002' + ) +}) + +test_that('Data must have a `time` column', { + expect_error( + objective_function( + model, + within(ddps, {ambient_2002$data$time = NULL}), + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ), + 'The following data-driver pairs are missing a `time` column in their `data` element: ambient_2002' + ) +}) + +test_that('Missing simulation outputs are detected', { + expect_error( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + data_definitions = data_definitions, + verbose_startup = verbose_startup + ), + 'Some data columns were missing from runner outputs: +ambient_2002: Pod +ambient_2005: Pod', + fixed = TRUE + ) +}) + +test_that('Out-of-range times are detected', { + time_offset <- 1e5 + + expect_error( + objective_function( + model, + within(ddps, { + ambient_2002$data$time <- ambient_2002$data$time + time_offset + ambient_2002$data_stdev$time <- ambient_2002$data_stdev$time + time_offset + }), + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = 
post_process_function, + verbose_startup = verbose_startup + ), + 'Some observed times were missing from runner outputs: +ambient_2002: 104272, 104512, 104848, 105184, 105520, 105880, 106192, 106888 (min_time = 3624, max_time = 6911)', + fixed = TRUE + ) +}) + +test_that('Multiple time matches are handled', { + # The drivers have a time step of 1, so if we specify half-integer times, + # there will actually be two "closest" points to each observed time. + time_offset <- 0.5 + + expect_silent( + objective_function( + model, + within(ddps, { + ambient_2002$data$time <- ambient_2002$data$time + time_offset + ambient_2002$data_stdev$time <- ambient_2002$data_stdev$time + time_offset + }), + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ) + ) +}) + +test_that('Weights must be supplied for all measured quantities', { + expect_error( + objective_function( + model, + ddps, + independent_args, + list(), + data_definitions = data_definitions, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ), + 'Weights were not supplied for the following measured quantities: Leaf, Stem, Pod' + ) +}) + +test_that('Bad normalization methods are detected', { + expect_error( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + normalization_method = 'bad_normalization_method', + data_definitions = data_definitions, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ), + 'Unsupported normalization_method: bad_normalization_method' + ) +}) + +test_that('Bad variance methods are detected', { + expect_error( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + stdev_weight_method = 'bad_stdev_method', + data_definitions = data_definitions, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ), + 'Unsupported 
stdev_weight_method: bad_stdev_method' + ) +}) + +test_that('Bad return values are detected', { + # A penalty evaluates to NA + expect_error( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + extra_penalty_function = function(x) {NA}, + verbose_startup = verbose_startup + ), + 'The objective function did not return a finite value when using the initial argument values; instead, it returned: NA' + ) + + # A predicted value is NA + expect_error( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = function(x) {within(x, {Pod = NA})}, + verbose_startup = verbose_startup + ), + 'The objective function did not return a finite value when using the initial argument values; instead, it returned: NA' + ) + + # A post-processing function removes the `time` column + expect_error( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = function(x) {within(x, { + Pod = Grain + Shell + time = NULL + })}, + verbose_startup = verbose_startup + ), + 'Some data columns were missing from runner outputs: +ambient_2002: time +ambient_2005: time', + fixed = TRUE + ) + + # A post-processing function doesn't return a data frame + expect_error( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = function(x) {1.0}, + verbose_startup = verbose_startup + ), + 'Some runners did not produce data frames: ambient_2002, ambient_2005' + ) +}) + +test_that('Bad regularization methods are detected', { + expect_error( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + regularization_method = 
'bad_regularization_method', + verbose_startup = verbose_startup + ), + 'Unsupported regularization method: bad_regularization_method' + ) +}) + +test_that('Bad data values and weights are detected', { + expect_error( + objective_function( + model, + within(ddps, { + ambient_2005$data_stdev = process_table(soyface_biomass[['ambient_2005_std']]) + ambient_2005$data_stdev[['Leaf_Mg_per_ha']] <- -0.1 + }), + independent_args, + quantity_weights, + stdev_weight_method = 'inverse', + stdev_weight_param = 0, + data_definitions = data_definitions, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ), + 'Issues were found with the following data sets: + ambient_2002: + The following columns contained non-finite values: w_var + ambient_2005: + The following columns contained non-finite values: w_var + The following columns contained negative values: quantity_stdev', + fixed = TRUE + ) +}) + +test_that('NA argument values and predicted values are handled', { + # An independent argument value is NA + expect_error( + objective_function( + model, + ddps, + within(independent_args, {alphaLeaf = NA}), + quantity_weights, + data_definitions = data_definitions, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ), + 'The model could not be run with the following drivers: +ambient_2002: Error in runner(as.numeric(independent_args)): At least one independent or dependent argument value is not finite +ambient_2005: Error in runner(as.numeric(independent_args)): At least one independent or dependent argument value is not finite', + fixed = TRUE + ) + + # A dependent argument value is NA + expect_error( + objective_function( + model, + ddps, + independent_args, + quantity_weights, + data_definitions = data_definitions, + dependent_arg_function = function(x) {list(alphaStem = NA)}, + post_process_function = post_process_function, + verbose_startup = verbose_startup + ), + 'The model could not be run with the 
following drivers: +ambient_2002: Error in runner(as.numeric(independent_args)): At least one independent or dependent argument value is not finite +ambient_2005: Error in runner(as.numeric(independent_args)): At least one independent or dependent argument value is not finite', + fixed = TRUE + ) +}) diff --git a/tests/testthat/test-update_model.R b/tests/testthat/test-update_model.R new file mode 100644 index 0000000..93d74eb --- /dev/null +++ b/tests/testthat/test-update_model.R @@ -0,0 +1,83 @@ +NEWVAL <- 10000 + +base_model_definition <- BioCro::soybean +independent_args <- list(alphaLeaf = NEWVAL, Leaf = NEWVAL) +new_ind_arg_values <- c(NEWVAL, NEWVAL) +dependent_arg_function <- function(x) {list(alphaStem = x$alphaLeaf)} + +test_that('A base model definition can be updated', { + # Without dependent arguments + expect_silent( + update_model( + base_model_definition, + independent_args, + new_ind_arg_values + ) + ) + + # With dependent arguments + new_model <- expect_silent( + update_model( + base_model_definition, + independent_args, + new_ind_arg_values, + dependent_arg_function = dependent_arg_function + ) + ) + + # Initial values are updated + expect_equal( + new_model[['initial_values']][['Leaf']], + NEWVAL + ) + + # Other initial values remain the same + expect_equal( + new_model[['initial_values']][['Stem']], + base_model_definition[['initial_values']][['Stem']] + ) + + # Dependent parameters are updated + expect_equal( + new_model[['parameters']][['alphaStem']], + NEWVAL + ) + + # Other parameters remain the same + expect_equal( + new_model[['parameters']][['alphaRoot']], + base_model_definition[['parameters']][['alphaRoot']] + ) +}) + +test_that('Base model definition must be valid', { + expect_error( + update_model( + base_model_definition[c('direct_modules', 'differential_modules')], + independent_args, + new_ind_arg_values, + dependent_arg_function = dependent_arg_function + ), + 'The `base_model_definition` must have the following elements: 
initial_values, parameters' + ) +}) + +test_that('Supplied arguments must be part of the base model definition', { + expect_error( + update_model( + base_model_definition, + c(independent_args, list(bad_arg = 10)), + new_ind_arg_values + ), + '`independent_args` and `new_ind_arg_values` must have the same length' + ) + + expect_error( + update_model( + base_model_definition, + c(independent_args, list(bad_arg = 10)), + c(new_ind_arg_values, 25) + ), + 'Values were supplied for the following quantities, but they are not `initial_values` or `parameters` of the `base_model_definition`: bad_arg' + ) +}) diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 0000000..097b241 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/BioCroValidation.Rmd b/vignettes/BioCroValidation.Rmd new file mode 100644 index 0000000..4982053 --- /dev/null +++ b/vignettes/BioCroValidation.Rmd @@ -0,0 +1,92 @@ +--- +title: "Getting Started With BioCroValidation" +output: + rmarkdown::html_vignette: + toc: true + number_sections: true +vignette: > + %\VignetteIndexEntry{Getting Started With BioCroValidation} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.width = 6, + fig.height = 5, + fig.align = "center" +) +``` + +# Overview + +**BioCroValidation** is an R package that provides a suite of tools for +validating BioCro crop growth models. + +Validation is a key part of using and developing BioCro models. The goal of this +package is to provide some convenient "helping" functions to aid with various +aspects of model validation, especially parameterization. + +The central tool in the package is the `objective_function` function. As its +name implies, it can be used to create an objective function that calculates the +value of an error metric value given a set of model parameter values. 
In turn, +the objective function itself can be passed to an optimizer. + +While it is possible for each BioCro user to write their own customized +objective function, creating one can be a very complex process because there are +many potential aspects to consider: + +- **Mathematical / Statistical Approach:** This refers to choices like "Do I + want to normalize the error terms by each observed value, or by the largest + observed value for each measured quantity?" or "Do I want to use L1 or L2 + regularization?" + +- **Implementation of Mathematical / Statistical Approach:** Once an approach + has been identified, code must be written to properly implement it. + +- **Error Checks:** A wide variety of strange conditions can occur during + parameterization, and the objective function must be ready to handle them. For + example, how should the objective function respond when a simulation does not + run to completion, or when an optimizer passes `NA` as a parameter value? + +- **Technical Details:** Parameterization can take a long time to perform, so it + is important for the objective function code to be as efficient and fast as + possible. + +The goal of `objective_function` is to allow users to make the key choices about +their mathematical approach using clear code statements like +`regularization_method = 'L2'`, while the implementation details, error checks, +and other technical details are handled internally. This will result in clear +scripts that are also more reliable. + +Besides `objective_function`, the package also includes a few other functions +with a similar goal of clarifying code and hiding implementation details, such +as `bounds_table` and `update_model`. 
+ +# Installing BioCroValidation + +The easiest way to install `BioCroValidation` is to type the following from +within an R terminal: + +```{r, eval = FALSE} +remotes::install_github('biocro/BioCroValidation') +``` + +Note that this method requires the `remotes` package, which can be installed +from within R by typing `install.packages('remotes')`. + +# Learning More + +The `BioCroValidation` package includes extensive documentation. The best place +to start is the +[Parameterizing Soybean-BioCro](parameterizing_soybean_biocro.html) +article, which illustrates the full process of defining an objective function, +running an optimization, examining the results, and saving the new model +definition. + +Another key resource is the help page for `objective_function`, which can be +accessed online or by typing `?objective_function` in an R terminal. This +document explains all the available options for normalization approaches, +regularization approaches, and other aspects of defining an objective function. diff --git a/vignettes/parameterizing_soybean_biocro.Rmd b/vignettes/parameterizing_soybean_biocro.Rmd new file mode 100644 index 0000000..651894a --- /dev/null +++ b/vignettes/parameterizing_soybean_biocro.Rmd @@ -0,0 +1,822 @@ +--- +title: "Parameterizing Soybean-BioCro" +output: + rmarkdown::html_vignette: + toc: true + number_sections: true +vignette: > + %\VignetteIndexEntry{Parameterizing Soybean-BioCro} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +bibliography: references.bib +link-citations: yes +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.width = 7.5, + fig.height = 5, + fig.align = "center" +) +``` + +# Overview + +This article shows how to create an objective function that can be used to +parameterize BioCro's soybean model +[@matthews_soybean_biocro_2021; @lochocki_biocro_2022]. 
+ +Since the original publication of Soybean-BioCro, the BioCro module library has +undergone several changes, and the model has been re-parameterized several +times. These parameterizations did not use `BioCroValidation`, since they were +performed before `BioCroValidation` was created. + +However, `BioCroValidation` is able to re-create the objective functions that +were used for these parameterizations. Here, we re-create the objective function +that was used for the parameterization included in version `3.2.0` of the BioCro +R package. + +In the commands below, we will use functions from several libraries, so we will +load them now: + +```{r libraries} +# Load required libraries +library(BioCroValidation) +library(BioCro) +library(dfoptim) +library(lattice) +``` + +# Building the Objective Function + +In this section, we will use the `objective_function` function from +`BioCroValidation` package to create an objective function that can be used to +parameterize Soybean-BioCro. For more details about this, please see the help +page for `objective_function` by typing `?objective_function` from an R +terminal. + +## The Base Model Definition + +We first need a base model definition that includes the necessary modules, +initial values, parameters, and differential equation solver specifications. For +this example, we will simply use Soybean-BioCro as the base model, with just one +small change: we will use an Euler solver rather than the default solver, which +will help make the optimization run faster. For reasonable sets of parameter +values, the Euler solver does not seem to cause any substantial errors when +running Soybean-BioCro. 
+ +```{r base_model_definition} +# Specify the base model definition +base_model_definition <- soybean +base_model_definition$ode_solver <- default_ode_solvers[['homemade_euler']] +``` + +## The Observed Data + +The observed data needed to parameterize Soybean-BioCro is included in the +`BioCroValidation` package as the `soyface_biomass` data set, which consists of +two years (2002 and 2005) of biomass data and associated standard deviations, +included in four separate tables. However, each table requires some +pre-processing to get it ready. + +One issue is that the data set specifies the day of year (DOY) for each harvest, +but we need to specify the time using BioCro's convention (the number of hours +since the start of the year). + +Another issue is that the data set includes pod and seed values, but +Soybean-BioCro calculates shell and seed masses, where the shell and seed +together comprise the pod. + +Although the observations do not include root biomass, it is nevertheless +important to constrain the predicted root mass to reasonable values. To do this, +it is assumed that the maximum root mass is seventeen percent of the maximum +aboveground biomass, and that it is achieved at the same time as maximum +above-ground biomass, based on observations reported in @ordonez_root_2020. In +the observed data, the sum of stem and leaf mass is largest at the fifth time +point in both years. So, root mass is estimated at this single time point and +added to the observed values. + +In previous parameterizations, a standard deviation for the root mass was not +explicitly estimated; instead, the standard-deviation-based weight factor was +implicitly set to 1. Because the `'logarithm'` method with +\(\epsilon = 10^{-5}\) was used, a weight factor of 1 implies a standard +deviation of \(1 / e - 10^{-5} \approx 0.3678694\). See the documentation page +(`?objective_function`) for more information about this weighting method. 
+ +Finally, the data set includes some values that are not needed for the +parameterization. This includes the leaf litter accumulated between each +harvest, as well as the `DOY` and `Rep_Mg_per_ha` columns that have been +superseded by other columns defined above. + +Here we will define a helping function that can accomplish the required +modifications described above; note that some operations are different depending +on whether the table represents biomass values or standard deviations: + +```{r process_tables} +# Define a helping function for processing data tables +process_table <- function(data_table, type) { + # Define new `time` column + data_table$time <- (data_table$DOY - 1) * 24.0 + + # Define new `Shell_Mg_per_ha` column + data_table$Shell_Mg_per_ha <- if (type == 'biomass') { + # The shell is all parts of the pod other than the seed + data_table$Rep_Mg_per_ha - data_table$Seed_Mg_per_ha + } else { + # Add uncertainties in quadrature, a simple approach to error propagation + sqrt(data_table$Rep_Mg_per_ha^2 + data_table$Seed_Mg_per_ha^2) + } + + # Define new `Root_Mg_per_ha` column, which has just one non-NA value + row_to_use <- 5 # Choose row to use + data_table$Root_Mg_per_ha <- NA # Initialize all values to NA + + if (type == 'biomass') { + # Estimate a mass at one time point + col_to_add <- c( + 'Leaf_Mg_per_ha', + 'Stem_Mg_per_ha', + 'Rep_Mg_per_ha' + ) + + data_table[row_to_use, 'Root_Mg_per_ha'] <- + 0.17 * sum(data_table[row_to_use, col_to_add]) + } else { + # Estimate standard deviation at one time point + data_table[row_to_use, 'Root_Mg_per_ha'] <- 1 / exp(1) - 1e-5 + } + + # Remove columns by setting them to NULL + data_table$DOY = NULL + data_table$Rep_Mg_per_ha = NULL + data_table$Litter_Mg_per_ha = NULL + + # Return the processed table + data_table +} +``` + +## The Data-Driver Pairs + +The `BioCro` R package includes weather data for the years in the +`soyface_biomass` data set. 
So now we are ready to define the data-driver pairs, +which includes the weather, the observed biomass, the standard deviation of the +observed biomass, and the weight to assign to each year: + +```{r data_driver_pairs} +# Define the data-driver pairs +data_driver_pairs <- list( + ambient_2002 = list( + data = process_table(soyface_biomass[['ambient_2002']], 'biomass'), + data_stdev = process_table(soyface_biomass[['ambient_2002_std']], 'stdev'), + drivers = BioCro::soybean_weather[['2002']], + weight = 1 + ), + ambient_2005 = list( + data = process_table(soyface_biomass[['ambient_2005']], 'biomass'), + data_stdev = process_table(soyface_biomass[['ambient_2005_std']], 'stdev'), + drivers = BioCro::soybean_weather[['2005']], + weight = 1 + ) +) +``` + +Here we have chosen equal weights for the two years. + +## The Post-Processing Function + +The observed data includes values of the total litter, which is comprised of +both leaf and stem litter. However, the model does not calculate this quantity; +instead, it returns separate values of leaf and stem litter. To address this +issue, we can provide a "post-processing function." This is an (optional) +function that is applied to each simulation result and can be used to add new +columns. Here we define such a function, which adds a new column for the total +litter: + +```{r post_process_function} +# Define the post-processing function +post_process_function <- function(sim_res) { + # Calculate the total litter as the sum of leaf and stem litter + within(sim_res, {TotalLitter = LeafLitter + StemLitter}) +} +``` + +## The Data Definitions + +The data sets above have columns whose names do not match the corresponding +model outputs. For example, the `Leaf_Mg_per_ha` column of the observed data +must be compared to the `Leaf` column of the model output, since both represent +the leaf mass per unit ground area. 
To handle this mismatch, we can provide a +set of "data definitions" that specify which columns should be compared: + +```{r data_definitions} +# Define the data definition list, where the element names are columns in the +# observed data tables, and the element values are the corresponding column +# names in the model outputs +data_definitions <- list( +# Observed Simulated + CumLitter_Mg_per_ha = 'TotalLitter', + Leaf_Mg_per_ha = 'Leaf', + Root_Mg_per_ha = 'Root', + Seed_Mg_per_ha = 'Grain', + Shell_Mg_per_ha = 'Shell', + Stem_Mg_per_ha = 'Stem' +) +``` + +## The Arguments to Vary + +Here we wish to vary several parameters related to carbon partitioning for +growth, senescence, maintenance respiration, and growth respiration: + +- For each growing tissue, there are two parameters (\(\alpha\) and \(\beta\)) + that influence the carbon partitioning coefficients. Here we will vary these + for the leaf, stem, and shell (6 parameters in total). + +- For each senescing tissue, there are three parameters (\(\alpha_{sen}\), + \(\beta_{sen}\), and `rate`) that influence when senescence begins and + the overall rate of senescence. Here we will vary these for the leaf and stem + (6 parameters in total). + +- For each growing tissue, there is one parameter (`grc`) that influences the + rate of carbon use for growth respiration. Here we will vary these for the + stem and root (2 parameters in total). + +- For each tissue, there is one parameter (`mrc`) that influences the rate of + carbon use for maintenance respiration. Here we will vary these for the leaf, + stem, and root (3 parameters in total). + +Together, this is 17 arguments to vary. Typically, an optimization problem +requires more time for each free parameter involved, so it is helpful to vary +the smallest possible set. One way to reduce the number of free parameters is to +treat some as being "dependent." 
In other words, to calculate the values of some +parameters from the values of others, so that only some of them are truly free +or "independent." Here we will do this by fixing the value of `mrc_stem` to the +value of `mrc_leaf`. Thus, we can think of this as a single maintenance +respiration coefficient for the entire shoot; this reduces the number of +independent parameters by one (to 16). + +The independent arguments must be specified as a list of named numeric elements, +where the name is the argument name and the value is an initial guess for that +argument. Here we will use the default Soybean-BioCro values as our initial +guesses: + +```{r independent_args} +# Define a list of independent arguments and their initial values +independent_arg_names <- c( + # Partitioning for leaf, stem, and shell + 'alphaLeaf', + 'betaLeaf', + 'alphaStem', + 'betaStem', + 'alphaShell', + 'betaShell', + + # Senescence for leaf and stem + 'alphaSeneLeaf', + 'betaSeneLeaf', + 'rateSeneLeaf', + 'alphaSeneStem', + 'betaSeneStem', + 'rateSeneStem', + + # Growth respiration for stem and root + 'grc_stem', + 'grc_root', + + # Maintenance respiration for leaf and root + 'mrc_leaf', + 'mrc_root' +) + +independent_args <- soybean$parameters[independent_arg_names] +``` + +The dependent arguments must be specified as a function that takes a list of +independent arguments as its input, and returns a list of dependent arguments as +its output: + +```{r dependent_arg_function} +# Define a function that sets `mrc_stem` to the value of `mrc_leaf` +dependent_arg_function <- function(ind_args) { + list(mrc_stem = ind_args[['mrc_leaf']]) +} +``` + +## The Quantity Weights + +When determining the error metric value, we wish to assign different weights to +each type of observed value. This can be handled via the `quantity_weights`, +which must be a list of named numeric elements, where the name of each element +is an output from the simulation, and its value is the weight. 
+ +```{r quantity_weights} +# Specify the quantity weights; there is no systematic way to determine these, +# but the following weights have worked well in the past for Soybean-BioCro +quantity_weights <- list( + Grain = 1.0, + Leaf = 1.0, + Root = 0.1, + Shell = 0.5, + Stem = 1.0, + TotalLitter = 0.1 +) +``` + +## The Extra Penalty Function + +Sometimes an optimizer may choose parameter values that produce close agreement +with the observed data but are nevertheless unreasonable from a biological +perspective. + +To prevent these unreasonable parameters from being chosen, "extra penalties" +can be added to the error metric. These penalties can be specified using an +`extra_penalty_function`, which must take the result from a BioCro simulation +as its input and return a numeric error penalty value, which generally should be +zero (when no issues are found) or a large positive number (if an issue has been +found). + +For Soybean-BioCro parameterization, three common issues are that: + +1. Carbon is never partitioned to one or more key tissues. + +2. Carbon partitioning to the stem and leaf starts at different times. + +3. Carbon partitioning to the leaves begins too early or too late. + +The function below will return a large value when any of these situations +occurs, and will otherwise return a value of zero. 
+ +```{r extra_penalty_function} +# Define an extra penalty function +extra_penalty_function <- function(sim_res) { + # Set the penalty value + PENALTY <- 9999 + + # Get the first times when each partitioning coefficient becomes non-zero + k_thresh <- 0.01 # Threshold k value to decide when growth has started + hpd <- 24.0 # Hours per day + + time <- sim_res[['time']] + + time_grain <- time[sim_res[['kGrain']] > k_thresh][1] + time_leaf <- time[sim_res[['kLeaf']] > k_thresh][1] + time_shell <- time[sim_res[['kShell']] > k_thresh][1] + time_stem <- time[sim_res[['kStem']] > k_thresh][1] + + # Return a penalty if necessary + if (is.na(time_grain) | is.na(time_leaf) | is.na(time_shell) | is.na(time_stem)) { + # One or more tissues is not growing + return(PENALTY) + } else if (abs(time_leaf - time_stem) > 5 * hpd) { + # The starts of leaf and stem growth are more than 5 days apart + return(PENALTY) + } else if (time_leaf - time[1] > 20 * hpd | time_leaf - time[1] < 10 * hpd) { + # The start of leaf growth is too late (more than 20 days after sowing) or + # too early (fewer than 10 days after sowing) + return(PENALTY) + } else { + # No problems were detected + return(0.0) + } +} +``` + +## The Objective Function + +Now we are just about ready to build the objective function. There are a few +more details to discuss: + +- Soybean-BioCro has always used the `'mean_max'` method for determining + normalization factors; see Equations 14-16 of @matthews_soybean_biocro_2021 + for more details. + +- Soybean-BioCro has always used the `'logarithm'` method for determining + weights from standard deviations with \(\epsilon = 10^{-5}\); see Equation 17 + of @matthews_soybean_biocro_2021 for more details. + +- Soybean-BioCro has not used any regularization. + +With this, it is possible to build the function. Note that some useful +information is printed out when the function is created, such as the full list +of observed values and their corresponding weights. 
+ +```{r objective_function} +# Create the objective function +obj_fun <- objective_function( + base_model_definition, + data_driver_pairs, + independent_args, + quantity_weights, + data_definitions = data_definitions, + normalization_method = 'mean_max', + stdev_weight_method = 'logarithm', + stdev_weight_param = 1e-5, + regularization_method = 'none', + dependent_arg_function = dependent_arg_function, + post_process_function = post_process_function, + extra_penalty_function = extra_penalty_function +) +``` + +# Optimizing the Parameter Values + +The objective function is designed to be passed to a minimization algorithm, +which will determine the argument values that produce the best agreement between +the model predictions and the observations. + +Soybean-BioCro has already been parameterized, so there is already good +agreement between the model and the data. This can be seen by examining the +value of the error metric when using the default Soybean-BioCro values: + +```{r} +# Evaluate the error function with default Soybean-BioCro parameters +default_error <- obj_fun(as.numeric(independent_args)) +``` + +This evaluates to `r default_error`. This is a low value for a Soybean-BioCro +parameterization, indicating that good agreement has already been found. + +Here, as an example, we will intentionally change each parameter value by a +small random amount, and then use an optimizer to improve the parameter values; +in an ideal world, the optimizer will eventually pick parameter values close to +the original Soybean-BioCro values. + +There are many optimizers available in R. Base R includes the `optim` function, +and others are available from the `dfoptim` and `DEoptim` packages. Here we will +use the `nmkb` optimizer from the `dfoptim` library, which requires upper and +lower bounds for each parameter and an initial guess. 
+ +## Choosing an Initial Guess + +As mentioned above, we will intentionally choose a "bad" initial guess by +tweaking each parameter value by a small random amount. Note that we set a seed +to ensure the same result is obtained every time this is performed. Also note +that the initial guess must be a numeric vector, where the elements are ordered +as they are in `independent_args`. + +```{r initial_guess} +# Set a seed +set.seed(1234) + +# Make an initial guess by perturbing the default values by a small amount +rel_size <- 0.02 + +initial_guess <- as.numeric(independent_args) * + (1.0 + runif(length(independent_args), -rel_size, rel_size)) +``` + +Even though the changes to parameter values are small, there is still a +substantial change in the value of the error metric: + +```{r} +# Evaluate the error function with default Soybean-BioCro parameters +initial_error <- obj_fun(initial_guess) +``` + +This evaluates to `r initial_error`, which is about +`r round(100 * (initial_error - default_error) / initial_error)` percent larger +than with the default parameter values. + +## Choosing Lower and Upper Bounds + +There is not always a systematic approach to choosing lower and upper bounds for +parameter values, but the following bounds have worked well for Soybean-BioCro +in the past: + +- The \(\alpha\) parameters used in partitioning and senescence calculations are + confined to the interval [0, 50]. + +- The \(\beta\) parameters used in partitioning and senescence calculations are + confined to the interval [-50, 0]. + +- The senescence rates have a lower bound of zero, but have different upper + bounds for each tissue. + +- The maintenance respiration coefficients are confined to the interval + [1e-6, 1e-2]. + +- The growth respiration coefficients must be positive and non-zero, but have + different bounds for each tissue. 
+ +There are many possible ways to specify the bounds in R, but ultimately they +must be expressed as numeric vectors, where the elements are ordered as they are +in `independent_args`. Here we use the `bounds_table` function from +`BioCroValidation` to create a data frame where the lower and upper bounds are +stored as columns. Later, the columns can be passed to the optimizer. The +`bounds_table` function will also check the initial guess to ensure it lies +within the bounds; for more information about this function, see its help page +by typing `?bounds_table` from an R terminal. + +```{r setting_bounds} +# Specify some bounds +aul <- 50 # Upper limit for alpha parameters +bll <- -50 # Lower limit for beta parameters +mll <- 1e-6 # Lower limit for mrc parameters +mul <- 1e-2 # Upper limit for mrc parameters + +# Define a table with the bounds in the same order as `independent_args` +bounds <- bounds_table( + independent_args, + list( + alphaLeaf = c(0, aul), + alphaStem = c(0, aul), + alphaShell = c(0, aul), + alphaSeneLeaf = c(0, aul), + alphaSeneStem = c(0, aul), + betaLeaf = c(bll, 0), + betaStem = c(bll, 0), + betaShell = c(bll, 0), + betaSeneLeaf = c(bll, 0), + betaSeneStem = c(bll, 0), + rateSeneLeaf = c(0, 0.0125), + rateSeneStem = c(0, 0.005), + mrc_leaf = c(mll, mul), + mrc_root = c(mll, mul), + grc_stem = c(8e-4, 0.08), + grc_root = c(0.0025, 0.075) + ), + initial_guess +) +``` + +## Running the Optimizer + +Now we will use an optimizer to improve on the initial guess. As mentioned +above, we will use the `nmkb` optimizer from the `dfoptim` package. This is a +good choice when a decent starting guess is known. If a broader search is +necessary, `DEoptim` from the `DEoptim` package may be more appropriate, +although it generally needs more time to run. + +To make sure this example does not take too much time, we will use a loose +tolerance; a more realistic example would probably use `1e-4` or `1e-5`. 
+ +```{r run_optimizer, eval = FALSE} +# Run the optimizer +optim_res <- nmkb( + initial_guess, + obj_fun, + bounds[['lower']], + bounds[['upper']], + control = list( + tol = 1e-2, + restarts.max = 10 + ), + debug_mode = FALSE # Passed to obj_fun; set to TRUE to enable debug mode +) +``` + +```{r, echo = FALSE} +timing <- system.time({ +<> +}) +``` + +When this document was generated, running the optimizer required the following +amount of time: + +```{r, echo = FALSE} +print(timing) +``` + +The total time was about `r round(timing[3] / 60, 2)` minutes. For a real +parameterization problem, it can be many times longer, and may even need days to +run on a personal laptop computer. + +The optimizer also reports how many times the objective function was called, +among other details: + +```{r} +str(optim_res) +``` + +The value of `feval` is `r optim_res[['feval']]`, so on average, each call of +the objective function required approximately +`r round(timing[3] / optim_res[['feval']], 3)` seconds. + +## Comparing Parameter Values + +Let's see whether the optimized parameters are closer to the default parameters +than the initial guess was. + +```{r compare_param} +# Create a table of the various independent argument values +ind_arg_table <- data.frame( + arg_name = independent_arg_names, + defaults = as.numeric(independent_args), + initial_guess = initial_guess, + optimized = optim_res[['par']], + stringsAsFactors = FALSE +) + +# Add differences +ind_arg_table <- within(ind_arg_table, { + initial_diff = abs(initial_guess - defaults) + optimized_diff = abs(optimized - defaults) + improved = optimized_diff < initial_diff +}) + +# View results +print(ind_arg_table) +``` + +In this table, when the `improved` column is `TRUE`, this means that the +optimized parameter value is closer to the default value than the initial guess +was; in other words, it means that the optimizer truly improved on the initial +guess. 
In this example, `r sum(ind_arg_table[['improved']])` out of +`r nrow(ind_arg_table)` parameter estimates were improved +(`r round(100 * sum(ind_arg_table[['improved']]) / nrow(ind_arg_table))` percent). + +We can also compare the error metric to its original value. As shown above, it +is now `r optim_res[['value']]`, which is only +`r round(100 * (optim_res[['value']] - default_error) / optim_res[['value']], 1)` +percent larger than with the default parameter values. + +The optimized parameter values could likely be improved by using a more +stringent tolerance for the optimizer, but this would require more time to run. + +## Comparing Model Outputs + +Another way to evaluate the results of the optimization is to compare +simulations using the default, perturbed, and re-optimized versions of the +model. + +Following the re-parameterization, we now have new values of the independent +arguments, but this is not enough to actually run the new version of the model. +Thus, the next step is to form a new model definition by updating the values of +the default soybean model. 
This can be accomplished using the `update_model` +function from `BioCroValidation`: + +```{r form_new_models} +# Get model definition lists for the perturbed and re-parameterized versions of +# the soybean model +soybean_perturbed <- update_model( + BioCro::soybean, + independent_args, + initial_guess, + dependent_arg_function = dependent_arg_function +) + +soybean_reparam <- update_model( + BioCro::soybean, + independent_args, + optim_res[['par']], + dependent_arg_function = dependent_arg_function +) +``` + +We can check that the three models have different values of key parameters, such +as the "dependent" argument `mrc_stem`: + +```{r} +print(BioCro::soybean$parameters$mrc_stem) + +print(soybean_perturbed$parameters$mrc_stem) + +print(soybean_reparam$parameters$mrc_stem) +``` + +Now we can run each version of the model for a single year and visually compare +their outputs: + +```{r comparison_plots} +# Define a helper function that runs a single model for the year 2005 +run_2005 <- function(model_definition) { + with(model_definition, {run_biocro( + initial_values, + parameters, + soybean_weather[['2005']], + direct_modules, + differential_modules, + ode_solver + )}) +} + +# Run each model and combine the results +full_res <- rbind( + within(run_2005(BioCro::soybean), {model = 'Default Soybean-BioCro'}), + within(run_2005(soybean_perturbed), {model = 'Perturbed Soybean-BioCro'}), + within(run_2005(soybean_reparam), {model = 'Re-parameterized Soybean-BioCro'}) +) + +# Plot the results +xyplot( + Leaf + Stem + Root + Grain ~ fractional_doy, + group = model, + data = full_res, + type = 'l', + auto.key = list(space = 'top'), + xlab = 'Day of year (2005)', + ylab = 'Biomass (Mg / ha)' +) +``` + +Here we can see that while the simulated values for some tissues do not differ +much between the models, there are large differences in other tissues; for these +cases, the default and re-optimized versions are similar and the perturbed +version is much different. 
+ +## Saving the New Model Definition + +A realistic parameterization takes a long time to complete, so it is important +to save the results for later use. One approach is to save the model definition +list using the `save` or `saveRDS` functions from base R. However, these options +create binary files that are not human-readable, and they cannot be easily +tracked using `git`. As an alternative, the `BioCroValidation` includes a +function called `write_model` that forms a string representing an R command that +can be called to re-create a model definition. This command string can be +written to a text file, making it easy to read and to track with `git`. + +Here we apply `write_model` to the re-optimized soybean model: + +```{r write_model_command} +# Convert the re-parameterized soybean model to an R command string +r_cmd_string <- with(soybean_reparam, write_model( + 'soybean_reparam', + direct_modules, + differential_modules, + initial_values, + parameters, + ode_solver +)) +``` + +We can view the resulting R command string: + +```{r} +writeLines(r_cmd_string) +``` + +It can also be written to a text file: + +```{r write_model_to_file, eval = FALSE} +# Save the model definition as an R file in the current working directory +writeLines(r_cmd_string, './soybean_reparam.R') +``` + +# Commands From This Document + +```{r, eval = FALSE} +### +### Preliminaries +### + +<> + +### +### Prepare inputs for `objective_function` +### + +<> + +<> + +<> + +<> + +<> + +<> + +<> + +<> + +<> + +### +### Create the objective function +### + +<> + +### +### Use an optimizer to choose parameter values +### + +<> + +<> + +<> + +### +### Check and record the new values +### + +<> + +<> + +<> + +<> + +<> + +``` + +# References diff --git a/vignettes/references.bib b/vignettes/references.bib new file mode 100644 index 0000000..4534ebb --- /dev/null +++ b/vignettes/references.bib @@ -0,0 +1,57 @@ + +@article{lochocki_biocro_2022, + author = {Lochocki, Edward B and Rohde, Scott and 
Jaiswal, Deepak and Matthews, Megan L and Miguez, Fernando and Long, Stephen P and McGrath, Justin M},
+ title = {BioCro II: a software package for modular crop growth simulations},
+ journal = {in silico Plants},
+ volume = {4},
+ number = {1},
+ pages = {diac003},
+ year = {2022},
+ month = {02},
+ doi = {10.1093/insilicoplants/diac003},
+}
+
+@article{matthews_soybean_biocro_2021,
+ author = {Matthews, Megan L and Marshall-Colón, Amy and McGrath, Justin M and Lochocki, Edward B and Long, Stephen P},
+ title = {Soybean-BioCro: a semi-mechanistic model of soybean growth},
+ journal = {in silico Plants},
+ volume = {4},
+ number = {1},
+ pages = {diab032},
+ year = {2021},
+ month = {12},
+ doi = {10.1093/insilicoplants/diab032},
+}
+
+@article{he_seasonal_2023,
+ title = {Seasonal climate conditions impact the effectiveness of improving photosynthesis to increase soybean yield},
+ journal = {Field Crops Research},
+ volume = {296},
+ pages = {108907},
+ year = {2023},
+ doi = {10.1016/j.fcr.2023.108907},
+ author = {Yufeng He and Megan L. Matthews},
+}
+
+@article{he_connecting_2024,
+ author = {He, Yufeng and Wang, Yu and Friedel, Douglas and Lang, Meagan and Matthews, Megan L},
+ title = {Connecting detailed photosynthetic kinetics to crop growth and yield: a coupled modelling framework},
+ journal = {in silico Plants},
+ volume = {6},
+ number = {2},
+ pages = {diae009},
+ year = {2024},
+ month = {06},
+ doi = {10.1093/insilicoplants/diae009},
+}
+
+@article{ordonez_root_2020,
+ title = {Root to shoot and carbon to nitrogen ratios of maize and soybean crops in the {US} {Midwest}},
+ volume = {120},
+ doi = {10.1016/j.eja.2020.126130},
+ journal = {European Journal of Agronomy},
+ author = {Ordóñez, Raziel A. and Archontoulis, Sotirios V. and Martinez-Feria, Rafael and Hatfield, Jerry L. and Wright, Emily E. and Castellano, Michael J.},
+ month = oct,
+ year = {2020},
+ pages = {126130},
+}