From 82820331fc50ba3d18a5aa006c971c1cfef1f4b7 Mon Sep 17 00:00:00 2001
From: Laura Maxwell
Date: Thu, 20 Mar 2025 11:07:51 -0400
Subject: [PATCH 01/22] add examples and vignettes

---
 inst/examples/1_AdverseEventKRI.R      |  53 ++++
 inst/examples/2_AdverseEventWorkflow.R | 125 +++++++++
 inst/examples/3_ReportingWorkflow.R    | 106 ++++++++
 inst/examples/4_WorkflowIO.R           |  92 +++++++
 inst/examples/_setup.R                 |   2 +
 vignettes/.gitignore                   |   1 +
 vignettes/ContributorGuidelines.Rmd    | 225 +++++++++++++++
 vignettes/Cookbook.Rmd                 |  37 ++-
 vignettes/DataAnalysis.Rmd             | 223 +++++++++++++++
 vignettes/DataModel.Rmd                | 363 ++++++++++++++++++++++++-
 vignettes/KRIMethod.Rmd                | 223 +++++++++++++++
 vignettes/gsmExtensions.Rmd            |  75 ++---
 12 files changed, 1471 insertions(+), 54 deletions(-)
 create mode 100644 inst/examples/1_AdverseEventKRI.R
 create mode 100644 inst/examples/2_AdverseEventWorkflow.R
 create mode 100644 inst/examples/3_ReportingWorkflow.R
 create mode 100644 inst/examples/4_WorkflowIO.R
 create mode 100644 inst/examples/_setup.R
 create mode 100644 vignettes/ContributorGuidelines.Rmd
 create mode 100644 vignettes/DataAnalysis.Rmd
 create mode 100644 vignettes/KRIMethod.Rmd

diff --git a/inst/examples/1_AdverseEventKRI.R b/inst/examples/1_AdverseEventKRI.R
new file mode 100644
index 0000000..774f17b
--- /dev/null
+++ b/inst/examples/1_AdverseEventKRI.R
@@ -0,0 +1,53 @@
+#### Example 1.1 - Generate an Adverse Event Metric using the standard {gsm.core} workflow
+# Numerator: count of Raw_AE records; denominator: sum of `timeonstudy`; grouped by `siteid`.
+dfInput <- Input_Rate(
+  dfSubjects = gsm.core::lSource$Raw_SUBJ,
+  dfNumerator = gsm.core::lSource$Raw_AE,
+  dfDenominator = gsm.core::lSource$Raw_SUBJ,
+  strSubjectCol = "subjid",
+  strGroupCol = "siteid",
+  strNumeratorMethod = "Count",
+  strDenominatorMethod = "Sum",
+  strDenominatorCol = "timeonstudy"
+)
+# Run the remaining steps in order: transform, analyze, flag, summarize.
+dfTransformed <- Transform_Rate(dfInput)
+dfAnalyzed <- Analyze_NormalApprox(dfTransformed, strType = "rate")
+dfFlagged <- Flag_NormalApprox(dfAnalyzed, vThreshold = c(-3, -2, 2, 3))
+dfSummarized <- Summarize(dfFlagged)
+
+table(dfSummarized$Flag)
+
+#### Example 1.2 - Make an SAE Metric by adding a filter. Also works with pipes.
+# `filter()` is namespaced: without {dplyr} attached, a bare `filter` resolves to stats::filter().
+SAE_KRI <- Input_Rate(
+  dfSubjects = gsm.core::lSource$Raw_SUBJ,
+  dfNumerator = gsm.core::lSource$Raw_AE %>% dplyr::filter(aeser == "Y"),
+  dfDenominator = gsm.core::lSource$Raw_SUBJ,
+  strSubjectCol = "subjid",
+  strGroupCol = "siteid",
+  strNumeratorMethod = "Count",
+  strDenominatorMethod = "Sum",
+  strDenominatorCol = "timeonstudy"
+) %>%
+  Transform_Rate() %>%
+  Analyze_NormalApprox(strType = "rate") %>%
+  Flag_NormalApprox(vThreshold = c(-3, -2, 2, 3)) %>%
+  Summarize()
+
+table(SAE_KRI$Flag)
+
+#### Example 1.3 - Visualize the Metric distribution with the provided bar chart htmlwidgets
+labels <- list(
+  Metric = "Serious Adverse Event Rate",
+  Numerator = "Serious Adverse Events",
+  Denominator = "Days on Study"
+)
+
+Widget_BarChart(dfResults = SAE_KRI, lMetric = labels, strOutcome = "Metric")
+Widget_BarChart(dfResults = SAE_KRI, lMetric = labels, strOutcome = "Score")
+Widget_BarChart(dfResults = SAE_KRI, lMetric = labels, strOutcome = "Numerator")
+
+#### Example 1.4 - Create Scatter plot with confidence bounds
+dfBounds <- Analyze_NormalApprox_PredictBounds(SAE_KRI, vThreshold = c(-3, -2, 2, 3))
+Widget_ScatterPlot(SAE_KRI, lMetric = labels, dfBounds = dfBounds)
diff --git a/inst/examples/2_AdverseEventWorkflow.R b/inst/examples/2_AdverseEventWorkflow.R
new file mode 100644
index 0000000..040b500
--- /dev/null
+++ b/inst/examples/2_AdverseEventWorkflow.R
@@ -0,0 +1,125 @@
+#### Example 2.1 - Configurable Adverse Event Workflow
+
+# Define YAML workflow: `meta` values are referenced by name in step params
+AE_workflow <- read_yaml(text =
+'meta:
+  Type: Analysis
+  ID: kri0001
+  GroupLevel: Site
+  Abbreviation: AE
+  Metric: Adverse Event Rate
+  Numerator: Adverse Events
+  Denominator: Days on Study
+  Model: Normal Approximation
+  Score: Adjusted Z-Score
+  AnalysisType: rate
+  Threshold: -2,-1,2,3
+  nMinDenominator: 30
+spec:
+  Mapped_AE:
+    subjid:
+      type: character
+  Mapped_SUBJ:
+    subjid:
+      type: character
+    invid:
+      type: character
+    timeonstudy:
+      type: integer
+steps:
+  - output: vThreshold
+    name: ParseThreshold
+    params:
+      strThreshold: Threshold
+  - output: Analysis_Input
+    name: Input_Rate
+    params:
+      dfSubjects: Mapped_SUBJ
+      dfNumerator: Mapped_AE
+      dfDenominator: Mapped_SUBJ
+      strSubjectCol: subjid
+      strGroupCol: invid
+      strGroupLevel: GroupLevel
+      strNumeratorMethod: Count
+      strDenominatorMethod: Sum
+      strDenominatorCol: timeonstudy
+  - output: Analysis_Transformed
+    name: Transform_Rate
+    params:
+      dfInput: Analysis_Input
+  - output: Analysis_Analyzed
+    name: Analyze_NormalApprox
+    params:
+      dfTransformed: Analysis_Transformed
+      strType: AnalysisType
+  - output: Analysis_Flagged
+    name: Flag_NormalApprox
+    params:
+      dfAnalyzed: Analysis_Analyzed
+      vThreshold: vThreshold
+  - output: Analysis_Summary
+    name: Summarize
+    params:
+      dfFlagged: Analysis_Flagged
+      nMinDenominator: nMinDenominator
+  - output: lAnalysis
+    name: list
+    params:
+      ID: ID
+      Analysis_Input: Analysis_Input
+      Analysis_Transformed: Analysis_Transformed
+      Analysis_Analyzed: Analysis_Analyzed
+      Analysis_Flagged: Analysis_Flagged
+      Analysis_Summary: Analysis_Summary
+')
+
+# Run the workflow
+lMappingWorkflows <- MakeWorkflowList(
+  c("AE", "SUBJ"),
+  strPath = here::here("tests/testthat/testdata/mappings"),
+  bExact = TRUE
+)
+mappings_spec <- CombineSpecs(lMappingWorkflows)
+lRawData <- Ingest(gsm.core::lSource, mappings_spec)
+AE_data <- list(
+  Mapped_SUBJ = lRawData$Raw_SUBJ,
+  Mapped_AE = lRawData$Raw_AE
+)
+AE_KRI <- RunWorkflow(lWorkflow = AE_workflow, lData = AE_data)
+
+# Create bar chart from workflow
+Widget_BarChart(dfResults = AE_KRI$Analysis_Summary)
+
+#### Example 2.2 - Run Country-Level Metric
+AE_country_workflow <- AE_workflow
+AE_country_workflow$meta$GroupLevel <- "Country"
+AE_country_workflow$steps[[2]]$params$strGroupCol <- "country" # steps[[2]] is the Input_Rate step
+
+AE_country_KRI <- RunWorkflow(lWorkflow = AE_country_workflow, lData = AE_data)
+Widget_BarChart(dfResults = AE_country_KRI$Analysis_Summary, lMetric = AE_country_workflow$meta)
+
+#### Example 2.3 - Create SAE workflow
+
+# Tweak AE workflow metadata
+SAE_workflow <- AE_workflow
+SAE_workflow$meta$File <- "SAE_KRI"
+SAE_workflow$meta$Metric <- "Serious Adverse Event Rate"
+SAE_workflow$meta$Numerator <- "Serious Adverse Events"
+
+# Prepend a `RunQuery` step that filters out non-serious AEs before the analysis steps run
+filterStep <- list(list(
+  name = "RunQuery",
+  output = "Mapped_AE",
+  params = list(
+    df = "Mapped_AE",
+    strQuery = "SELECT * FROM df WHERE aeser = 'Y'"
+  )
+))
+SAE_workflow$steps <- append(SAE_workflow$steps, filterStep, after = 0)
+
+# Run the updated workflow
+SAE_KRI <- RunWorkflow(lWorkflow = SAE_workflow, lData = AE_data)
+Widget_BarChart(dfResults = SAE_KRI$Analysis_Summary, lMetric = SAE_workflow$meta)
+
+
+
diff --git a/inst/examples/3_ReportingWorkflow.R b/inst/examples/3_ReportingWorkflow.R
new file mode 100644
index 0000000..5d679f7
--- /dev/null
+++ b/inst/examples/3_ReportingWorkflow.R
@@ -0,0 +1,106 @@
+library(gsm.core)
+library(gsm.mapping)
+library(gsm.kri)
+library(gsm.reporting)
+
+#### 3.1 - Create a KRI Report using 12 standard metrics in a step-by-step workflow
+
+core_mappings <- c("AE", "COUNTRY", "DATACHG", "DATAENT", "ENROLL", "LB",
+  "PD", "QUERY", "STUDY", "STUDCOMP", "SDRGCOMP", "SITE", "SUBJ")
+
+# Step 0 - Create Raw Data from Source Data (rename/select source columns by hand)
+lRaw <- list(
+  Raw_SUBJ = gsm.core::lSource$Raw_SUBJ,
+  Raw_AE = gsm.core::lSource$Raw_AE,
+  Raw_PD = gsm.core::lSource$Raw_PD %>%
+    rename(subjid = subjectenrollmentnumber),
+  Raw_LB = gsm.core::lSource$Raw_LB,
+  Raw_STUDCOMP = gsm.core::lSource$Raw_STUDCOMP %>%
+    select(subjid, compyn),
+  Raw_SDRGCOMP = gsm.core::lSource$Raw_SDRGCOMP,
+  Raw_DATACHG = gsm.core::lSource$Raw_DATACHG %>%
+    rename(subject_nsv = subjectname),
+  Raw_DATAENT = gsm.core::lSource$Raw_DATAENT %>%
+    rename(subject_nsv = subjectname),
+  Raw_QUERY = gsm.core::lSource$Raw_QUERY %>%
+    rename(subject_nsv = subjectname),
+  Raw_ENROLL = gsm.core::lSource$Raw_ENROLL,
+  Raw_SITE = gsm.core::lSource$Raw_SITE %>%
+    rename(studyid = protocol) %>%
+    rename(invid = pi_number) %>%
+    rename(InvestigatorFirstName = pi_first_name) %>%
+    rename(InvestigatorLastName = pi_last_name) %>%
+    rename(City = city) %>%
+    rename(State = state) %>%
+    rename(Country = country) %>%
+    rename(Status = site_status),
+  Raw_STUDY = gsm.core::lSource$Raw_STUDY %>%
+    rename(studyid = protocol_number) %>%
+    rename(Status = status)
+)
+
+# Step 1 - Create Mapped Data Layer - filter, aggregate and join raw data to create mapped data layer
+mappings_wf <- MakeWorkflowList(strNames = core_mappings, strPath = "workflow/1_mappings", strPackage = "gsm.mapping")
+mapped <- RunWorkflows(mappings_wf, lRaw)
+
+# Step 2 - Create Metrics - calculate metrics using mapped data
+metrics_wf <- MakeWorkflowList(strPath = "workflow/2_metrics", strPackage = "gsm.kri")
+analyzed <- RunWorkflows(metrics_wf, mapped)
+
+# Step 3 - Create Reporting Layer - create reports using metrics data
+reporting_wf <- MakeWorkflowList(strPath = "workflow/3_reporting", strPackage = "gsm.reporting")
+reporting <- RunWorkflows(reporting_wf,
+  c(mapped, list(lAnalyzed = analyzed, lWorkflows = metrics_wf)))
+
+# Step 4 - Create KRI Reports - create KRI report using reporting data
+module_wf <- MakeWorkflowList(strPath = "workflow/4_modules", strPackage = "gsm.kri")
+lReports <- RunWorkflows(module_wf, reporting)
+
+#### 3.2 - Automate data ingestion using Ingest() and CombineSpecs()
+# Step 0 - Data Ingestion - standardize table/column names
+mappings_wf <- MakeWorkflowList(strNames = core_mappings, strPath = "workflow/1_mappings", strPackage = "gsm.mapping")
+mappings_spec <- CombineSpecs(mappings_wf)
+lRaw <- Ingest(gsm.core::lSource, mappings_spec)
+
+# Step 1 - Create Mapped Data Layer - filter, aggregate and join raw data to create mapped data layer
+mapped <- RunWorkflows(mappings_wf, lRaw)
+
+# Step 2 - Create Metrics - calculate metrics using mapped data
+metrics_wf <- MakeWorkflowList(strPath = "workflow/2_metrics", strPackage = "gsm.kri")
+analyzed <- RunWorkflows(metrics_wf, mapped)
+
+# Step 3 - Create Reporting Layer - create reports using metrics data
+reporting_wf <- MakeWorkflowList(strPath = "workflow/3_reporting", strPackage = "gsm.reporting")
+reporting <- RunWorkflows(reporting_wf,
+  c(mapped, list(lAnalyzed = analyzed, lWorkflows = metrics_wf)))
+
+# Step 4 - Create KRI Report - create KRI report using reporting data
+module_wf <- MakeWorkflowList(strPath = "workflow/4_modules", strPackage = "gsm.kri")
+lReports <- RunWorkflows(module_wf, reporting)
+
+#### 3.3 - Combine steps into a single workflow
+# ss_wf <- MakeWorkflowList(strNames = "Snapshot")
+# lReports <- RunWorkflows(ss_wf, lSource)
+
+#### 3.4 - Use Study configuration to specify data sources
+# StudyConfig <- read_yaml("inst/workflow/config.yaml")
+# mapped <- RunWorkflows(mappings_wf, lConfig = StudyConfig)
+# analyzed <- RunWorkflows(metrics_wf, lConfig = StudyConfig)
+# reporting <- RunWorkflows(reporting_wf, lConfig = StudyConfig)
+# lReports <- RunWorkflows(module_wf, lConfig = StudyConfig)
+
+#### 3.5 - Site-Level KRI Report with multiple SnapshotDate
+# TODO: the code below relies on {clindata}-derived reporting data - do the reporting datasets need to be rerun/rewritten?
+lCharts <- MakeCharts(
+  dfResults = gsm.core::reportingResults,
+  dfGroups = gsm.core::reportingGroups,
+  dfMetrics = gsm.core::reportingMetrics,
+  dfBounds = gsm.core::reportingBounds
+)
+
+kri_report_path <- Report_KRI(
+  lCharts = lCharts,
+  dfResults = FilterByLatestSnapshotDate(gsm.core::reportingResults),
+  dfGroups = gsm.core::reportingGroups,
+  dfMetrics = gsm.core::reportingMetrics
+)
diff --git a/inst/examples/4_WorkflowIO.R b/inst/examples/4_WorkflowIO.R
new file mode 100644
index 0000000..9024811
--- /dev/null
+++ b/inst/examples/4_WorkflowIO.R
@@ -0,0 +1,92 @@
+devtools::load_all()
+
+# Load each domain listed in `lWorkflow$spec` from its `lConfig$Domains` entry
+# (a data frame, a zero-argument function, or a CSV path), apply the domain's
+# spec with ApplySpec(), and return `lData` with those domains added/replaced.
+LoadData <- function(lWorkflow, lConfig, lData = NULL) {
+  if (is.null(lData)) lData <- list()
+  for (strDomain in names(lWorkflow$spec)) {
+    input <- lConfig$Domains[[strDomain]]
+
+    if (is.data.frame(input)) {
+      data <- input
+    } else if (is.function(input)) {
+      data <- input()
+    } else if (is.character(input)) {
+      data <- read.csv(input)
+    } else {
+      cli::cli_abort("Invalid data source for domain {strDomain}.")
+    }
+
+    lData[[strDomain]] <- ApplySpec(data, lWorkflow$spec[[strDomain]])
+  }
+  lData
+}
+
+# Write `lWorkflow$lResult` as CSV to the path configured in `lConfig$Domains`
+# under "<Type>_<ID>"; if that domain is not configured, only report it.
+SaveData <- function(lWorkflow, lConfig) {
+  domain <- paste0(lWorkflow$meta$Type, "_", lWorkflow$meta$ID)
+  cli::cli_alert_info("{domain}")
+
+  if (domain %in% names(lConfig$Domains)) {
+    output <- lConfig$Domains[[domain]]
+    cli::cli_alert_info("{output}")
+
+    cli::cli_alert_info(
+      'Saving output of `lWorkflow` to `{output}`.'
+    )
+
+    # No row names: otherwise read.csv() in LoadData() gains an extra `X` column.
+    write.csv(lWorkflow$lResult, output, row.names = FALSE)
+  } else {
+    cli::cli_alert_info(
+      '{domain} not found.'
+    )
+  }
+}
+
+lConfig <- list(
+  LoadData = LoadData,
+  SaveData = SaveData,
+  Domains = list( # raw domains are functions; mapped domains are CSV paths
+    Raw_STUDY = function() gsm.core::lSource$Raw_STUDY,
+    Raw_SITE = function() gsm.core::lSource$Raw_SITE,
+    Raw_PD = function() gsm.core::lSource$Raw_PD,
+
+    Raw_SUBJ = function() gsm.core::lSource$Raw_SUBJ,
+    Raw_ENROLL = function() gsm.core::lSource$Raw_ENROLL,
+    Raw_SDRGCOMP = function() gsm.core::lSource$Raw_SDRGCOMP,
+    Raw_STUDCOMP = function() gsm.core::lSource$Raw_STUDCOMP,
+    Raw_LB = function() gsm.core::lSource$Raw_LB,
+    Raw_AE = function() gsm.core::lSource$Raw_AE,
+
+    Raw_DATAENT = function() gsm.core::lSource$Raw_DATAENT,
+    Raw_DATACHG = function() gsm.core::lSource$Raw_DATACHG,
+    Raw_QUERY = function() gsm.core::lSource$Raw_QUERY,
+
+    Mapped_STUDY = file.path(tempdir(), "mapped-study.csv"),
+    Mapped_SITE = file.path(tempdir(), "mapped-site.csv"),
+    Mapped_COUNTRY = file.path(tempdir(), "mapped-country.csv"),
+    Mapped_PD = file.path(tempdir(), "mapped-pd.csv"),
+
+    Mapped_SUBJ = file.path(tempdir(), "mapped-subj.csv"),
+    Mapped_ENROLL = file.path(tempdir(), "mapped-enroll.csv"),
+    Mapped_SDRGCOMP = file.path(tempdir(), "mapped-sdrgcomp.csv"),
+    Mapped_STUDCOMP = file.path(tempdir(), "mapped-studcomp.csv"),
+    Mapped_LB = file.path(tempdir(), "mapped-lb.csv"),
+    Mapped_AE = file.path(tempdir(), "mapped-ae.csv"),
+
+    Mapped_DATAENT = file.path(tempdir(), "mapped-dataent.csv"),
+    Mapped_DATACHG = file.path(tempdir(), "mapped-datachg.csv"),
+    Mapped_QUERY = file.path(tempdir(), "mapped-query.csv")
+  )
+)
+
+core_mappings <- c("AE", "COUNTRY", "DATACHG", "DATAENT", "ENROLL", "LB",
+  "PD", "QUERY", "STUDY", "STUDCOMP", "SDRGCOMP", "SITE", "SUBJ")
+
+lMappedData <- RunWorkflows(
+  MakeWorkflowList(strNames = core_mappings, strPath = "workflow/1_mappings", strPackage = "gsm.mapping"),
+  lConfig = lConfig
+)
diff --git a/inst/examples/_setup.R b/inst/examples/_setup.R
new file mode 100644
index 0000000..f3a2ae6
--- /dev/null
+++ b/inst/examples/_setup.R @@ -0,0 +1,2 @@ +devtools::install_github('gilead-biostats/gsm.core@dev') +# or gsm.core git checkout branch and devtools::load_all() diff --git a/vignettes/.gitignore b/vignettes/.gitignore index 097b241..9618e1a 100644 --- a/vignettes/.gitignore +++ b/vignettes/.gitignore @@ -1,2 +1,3 @@ *.html *.R +qualification.log diff --git a/vignettes/ContributorGuidelines.Rmd b/vignettes/ContributorGuidelines.Rmd new file mode 100644 index 0000000..84b7d78 --- /dev/null +++ b/vignettes/ContributorGuidelines.Rmd @@ -0,0 +1,225 @@ +--- +title: "Contributor Guidelines" +description: "This page outlines the development process for `{gsm}` packages, including how to contribute by filing issues, bug reports, and submitting code via a Pull Request." +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Contributor Guidelines} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +library(gsm.core) +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +# Introduction + +This page outlines the development process for `{gsm}` packages, including how to contribute by filing issues, bug reports, and submitting code via a Pull Request. + +## Prerequisites + +Before contributing code via a Pull Request, make sure to file an [issue](https://github.com/Gilead-BioStats/gsm.core/issues/new/choose) using one of the pre-specified issue templates. Choose the template that best categorizes what you aim to contribute, which generally can be one of the following: + +- Bugfix Issue: Fix a bug in the code +- Feature Issue: Develop a new feature +- QC Issue: Update QC framework, including documentation, qualification, automation, etc. + +Someone from the development team will decide if the issue is in scope. If so, the issue will be appropriately triaged and assigned to a core developer, or approval to submit a Pull Request associated with the submitted issue will be granted. 
If it is decided that the issue is out of scope or otherwise irrelevant, the issue will be closed. + +The issue templates provide comments/prompts to help ensure that all relevant information is included. When submitting issues for bug fixes or specific feature requests, it is often helpful to provide a minimal [reprex](https://www.tidyverse.org/help/#reprex), or reproducible example, to help the core developers visualize the issue. + +Suggestions or other input that might not warrant formal submission of an issue can be filed under [discussions](https://github.com/Gilead-BioStats/gsm.core/discussions), which can help facilitate discourse of specific use-cases or requests. + +## Branches + +The core branches that are used in this repository are: + +- `main`: Contains the production version of the package. +- `dev`: Contains the development version of the package. +- `fix`: Used to develop new functionality in the package. See [Development Process](#development-process) below for more details. +- `release`: Used to conduct regression testing and finalize QC documentation for a release. See [Release Process](#release-process) below for more details. + +# Development Process {#development-process} + +All code development takes place in `fix` branches. This section provides general guidance about this process flow. A detailed step-by-step workflow for code development in `fix` branches can be found in the first section of [Appendix 1](#fix-branch-workflow) below. + +Once an issue is filed and delegated to a core developer, a `fix` branch will be opened, which is where all package development related to that issue will be conducted. Each `fix` branch should be linked to one or more of the filed GitHub [issue(s)](https://github.com/Gilead-BioStats/gsm.core/issues). The issue(s) will be referenced in the naming of the `fix` branch. For example, a branch named `fix-111` addresses issue #111. 
Tasks related to documentation, testing, and/or qualification may also use `fix` branches and associated issues. + +In addition to the above, please also use the following general guidelines when creating a Pull Request: + +- New code should generally follow the [tidyverse style guide](https://style.tidyverse.org/), but automatic styling will be applied before each release. More details about the style guide can be found [here](#style-guide). +- Documentation should be included, using the [roxygen2](https://cran.r-project.org/web/packages/roxygen2/vignettes/roxygen2.html) package. +- New functions or changes to existing functions should include updated unit tests to demonstrate branch compatibility. Core developers request that unit tests are developed using [testthat \>= v3.0.0](https://testthat.r-lib.org/). +- Please include any relevant details that will provide context for the proposed updates or new functionality. Additionally, link the Pull Request to the relevant issue(s) by using either [closing keywords](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword), or the `Development` section on the sidebar of the Pull Request page. +- In general, all Pull Requests should target the `dev` branch (with the exception of a release Pull Request). +- All checks and tests must be passing before merging a Pull Request to `dev`. These checks are automatically run via GitHub Actions, as described in [Appendix 3](#github-action-workflow), but you can also run them locally by calling `devtools::check()` on your `fix` branch before finalizing a Pull Request. +- During the development process, developers should verify that the `qualification-dev-check` Action runs successfully. This checks whether the qualification tests were executed successfully or failed. 
This can be verified by looking under the Checks tab of a given Pull Request - the status will read either "Success" or "Failure". If there are any conflicts present or new functionality is added to code that is not covered by existing qualification tests, the developer submitting the Pull Request should add this information to the version qualification issue. + +# Release Process {#release-process} + +Code release follows a process using `release` branches. A release is initiated when all feature development, QC, and qualification has been completed for a given functionality. The primary objective of the Release Workflow is to conduct regression testing and finalize all QC documentation for that release. A detailed step-by-step workflow for code release can be found in the second section of [Appendix 1](#release-branch-workflow) below. + +# Style Guide {#style-guide} + +Code developers for `{gsm}` use the [tidyverse style guide](https://style.tidyverse.org/) with minimal modifications. The code below is run to standardize styling before each release: + + double_indent_style <- styler::tidyverse_style() + double_indent_style$indention$unindent_fun_dec <- NULL + double_indent_style$indention$update_indention_ref_fun_dec <- NULL + double_indent_style$line_break$remove_line_breaks_in_fun_dec <- NULL + styler::style_dir('R', transformers = double_indent_style) + styler::style_dir('tests', recursive = TRUE, transformers = double_indent_style) + +# Appendix 1 - Detailed Workflows + +## `fix` Branch Workflow {#fix-branch-workflow} + +1. Create issue(s) defining addition(s) and/or revision(s): + - Select the appropriate [template](https://github.com/Gilead-BioStats/gsm.core/issues/new/choose) to use (should be one of the following): + - `Bugfix Issue` + - `Feature Issue` + - `QC Issue` + - Assign issue(s) to core developer(s). + - Assign milestone within issue(s). +2. 
Developer creates `fix` branch (with nomenclature reflecting associated issue(s)) and updates the associated code. +3. Developer opens Pull Request for the `fix` branch to be merged into the `dev` branch using the GitHub default Pull Request template. Developer should do the following: + - Assign Pull Request to self. + - Request review(s). + - Assign milestone. + - Link to associated issue(s). +4. Before the `fix` branch can be merged into the `dev` branch, the Pull Request must: + - Be approved by assigned code reviewer(s). + - Pass all GitHub qualification checks. +5. `fix` branch is merged into the `dev` branch after the above requirements are fulfilled. The user who merges the `fix` branch should make sure to delete it upon merging. + +## `release` Branch Workflow {#release-branch-workflow} + +1. Release Owner creates `release` branch from `dev` branch. + - The `release` branch should be named according to the version of the package being released (e.g., `release-v1.2.0`) using [semantic versioning](https://semver.org/). + - If a release branch is already created, make sure that it is synced with the current `dev` branch. +2. Release Owner prepares the release for QC by performing the following steps and pushing updates to the `release` branch: + - Confirm that the version in the `DESCRIPTION` file is up to date. + - Run `styler` using the script from the [style guide](#style-guide) above (or by running `gutil::style_code()`) and commit any updates. + - Update `NEWS.md` with a summary of the revisions/additions in the release. Keep any information from previous releases to maintain traceability through versions. + - Ensure that the qualification specifications spreadsheet is up-to-date and accurate. If there have been any changes/updates to qualification tests, reach out to the qualification developer to update any necessary files. + - If applicable, review `README.md` and relevant vignettes to make sure updates are accurately described. 
+ - Ensure all unit tests are passing. + - Check if all qualification tests are passing and if new features were added that need to be qualified. If updates are needed, they should be outlined in a release QC issue. + - Run `devtools::spell_check()` and resolve findings. + - Build site using `pkgdown::build_site()`. Check that all examples are displayed correctly and that all new functions occur on the Reference page. + - Open a clean R session. Run `devtools::install()` and then `devtools::check()` locally and confirm that there are no issues/conflicts. +3. Release Owner creates Pull Request from the `release` branch to the `main` branch: + - Use the [release Pull Request template](https://github.com/Gilead-BioStats/gsm.core/blob/dev/.github/PULL_REQUEST_TEMPLATE/release.md) by adding `?template=release.md` to the URL when creating the Pull Request. The user can also click the link, then click `Raw`, and copy/paste the displayed Markdown into the Pull Request. + - Assign Pull Request to self. + - Request QC review(s). + - Assign milestone. + - Complete Risk Assessments for each Assessment/Feature added as outlined in the Pull Request template. + - Create comments in the Pull Request with a unique [QC checklist](#appendix-2---qc-checklist) for each selected Assessment/Feature (See [example for v0.1.0](https://github.com/Gilead-BioStats/gsm.core/pull/194)). +4. QC Reviewer(s) conduct(s) review by: + - Completing all QC checklists in the Pull Request. + - Ensuring all GitHub Actions on the Pull Request to the `main` branch are passing. +5. QC Reviewer(s) approve(s) Pull Request or request(s) changes. If changes are needed: + - QC Reviewer(s) should file issues and the development team should follow the standard package development process using `fix` branches. + - Once issues are resolved and merged to the `dev` branch, Release Owner can merge the `dev` branch into the `release` branch, and re-request review. 
+ - If needed, the original Pull Request can be closed and a new release Pull Request can be created with a Release Candidate (RC) value added to the branch name (e.g., `release-v1.2.0-RC2`). +6. Once the Pull Request is approved, the Release Owner should complete the release by taking the following steps: + - Merge the release Pull Request to the `main` branch. + - Create the GitHub release targeting the `main` branch using the wording from `NEWS.md`, in addition to the automatically generated content in GitHub. + - Confirm that the QC Report is attached to the release. +7. Finally, the Release Owner (or qualified designee) should complete the following housekeeping tasks: + - Create a Pull Request to merge the `main` branch into the `dev` branch to sync any updates that were made during the release process. + - Check that all issues associated with the current release are closed. + - Update the milestone for any incomplete tasks. + - Delete code branches associated with previous releases. + - Close the milestone and project associated with the previous release. + +# Appendix 2 - QC Checklist {#appendix-2---qc-checklist} + +This QC checklist is to be used as part of the Development and Release Workflows described above. When applied to an Assessment/Feature, confirm that each function meets the requirements described. When applied to utility or other functionality, use relevant sections of the checklist and modify QC checks as needed. A risk-based approach will be used to determine whether each release requires a high-level or detailed release QC. + +### High-Level QC Checklist + +- [ ] Documentation + - [ ] New functionality contains an `@export` tag. + - [ ] New functionality contains an adequate level of documentation. +- [ ] Error Checking + - [ ] New functionality has associated unit test(s). + - [ ] Tests confirm that the input data has required columns (if any). + - [ ] Tests confirm that the output data has expected columns/structure. 
+ - [ ] Tests confirm intended functionality for each parameter. +- [ ] Data Model + - [ ] Running `Make_Snapshot()` using defaults returns no errors. + - [ ] Running `CheckSnapshotInputs()` using the output of `Make_Snapshot()$lSnapshot` returns no errors. + - [ ] Running `Study_Assess()` using defaults returns no errors. + - [ ] Running `Study_Report()` using the output of `Study_Assess()` from above produces an HTML report that accurately displays results. +- [ ] Basic QC + - [ ] Assessment has User Requirements + Qualification tests captured using qualification framework. A QC report is generated as expected and all checks pass successfully. + - [ ] Code is well commented and easy to read/understand. + - [ ] Qualification specifications spreadsheet (`qualification_specs.csv`) has been reviewed and approved by a qualification developer. + - [ ] No file paths or other company-specific data are present. + - [ ] `devtools::check()` passes with no errors/warnings/notes. + - [ ] Package documents are up to date, and running `devtools::document()` does not change any files. + - [ ] Code uses `{tidyverse}` best practices for standard data manipulation. If unclear, reviewer should start a discussion thread. + - [ ] All new dependencies add significant value. If unclear, reviewer should start a discussion thread. + - [ ] All GitHub Actions run with no errors. 
+ +### Detailed QC Checklist + +- [ ] Documentation + - [ ] Function name captured in [roxygen2 title](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd.html#the-description-block) (e.g., "Adverse Event Assessment") + - [ ] Assessment purpose captured in [roxygen2 description](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd.html#the-description-block) (e.g., "Evaluates adverse event (AE) rates to identify sites that may be over- or under-reporting AEs") + - [ ] Input data requirements are captured in a dedicated [roxygen2 details section](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd.html#sections) under *Data specification* (`#' @section Data specification`, or *\# Data specification* if storing data specification in a `.md` file). + - [ ] Statistical methods and assumptions are captured in a dedicated [roxygen2 details section](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd.html#sections) under *Statistical assumptions* (`#' @section Statistical assumptions`, or *\# Statistical assumptions* if storing statistical assumptions in a `.md` file). This section should link to the relevant `Analyze_` function(s) for further details. + - [ ] All function parameters are described with a [`@param` tag](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd.html#functions). Each parameter description should include its name, type, purpose, usage details, default value (if applicable), requirement, and valid options (if applicable). + - [ ] All external dependencies are captured. Use `@importFrom _package_ _function_` when importing five (5) or fewer functions, and `@import _package_` otherwise. + - [ ] Function output is captured with a [`@return` tag](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd.html#functions). Each output description should include output type, structure, and data specification (if applicable). 
+ - [ ] At least one (1) example is provided under an [`@examples` tag](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd.html#functions). +- [ ] Error Checking + - [ ] Basic checks for all parameters should be included using `stopifnot()` or similar logic (e.g., `stopifnot("dfInput is not a data frame" = is.data.frame(dfInput))`) + - [ ] Tests confirm that `stopifnot()` parameter checks are working as expected. + - [ ] Tests confirm that the input data has required columns (if any). + - [ ] Tests confirm that the output data has expected columns/structure. + - [ ] Tests confirm intended functionality for each parameter. + - [ ] Tests confirm that missing data in required columns is handled appropriately and errors/warnings are produced if needed. +- [ ] Basic QC + - [ ] Assessment has User Requirements + Qualification tests captured using qualification framework. A QC report is generated as expected and all checks pass successfully. + - [ ] Code is well commented and easy to read/understand. + - [ ] Qualification specifications spreadsheet (`qualification_specs.csv`) has been reviewed and approved by a qualification developer. + - [ ] No file paths or other company-specific data are present. + - [ ] Functions from non-tidyverse dependencies are called via `::`. + - [ ] `devtools::check()` passes with no errors/warnings/notes. + - [ ] Package documents are up to date, and running `devtools::document()` does not change any files. + - [ ] Code uses `{tidyverse}` best practices for standard data manipulation. If unclear, reviewer should start a discussion thread. + - [ ] All new dependencies add significant value. If unclear, reviewer should start a discussion thread. + - [ ] All GitHub Actions run with no errors. 
+ +# Appendix 3 - Continuous Integration with GitHub Actions {#github-action-workflow} + +GitHub Actions are used in all `{gsm}` packages to automate processes and ensure all code and documentation is created consistently and documented thoroughly. + +## Merges to `dev` Branch + +- R CMD Check (`R-CMD-check-dev`): + - Basic R CMD check which can be run using `rcmdcheck::rcmdcheck()` + - Provides an additional check for the ability to build the `pkgdown` reference index and ensure that all functions are documented correctly. This check will run on `ubuntu-latest` and on R version 4.1.3. +- Build Markdown (`build-markdown`): + - Builds Assessment Specification tables from function documentation + - Outputs are added to `man`/`.md` and any changes are committed to the compare branch or the triggering Pull Request. +- Test Coverage (`test-coverage`): + - Uses `{covr}` to check the package coverage. +- Qualification Check (`qualification-check-dev`): + - Runs the qualification tests but will not fail if any of the tests do not pass. Developers should review this check when changes that might need updates to qualification are done. + +## Merges to `main` Branch + +- R CMD Check (`R-CMD-check-main`): + - Basic R CMD check which can be run using `rcmdcheck::rcmdcheck()` + - Provides an additional check for the ability to build the `pkgdown` reference index and ensure that all functions are documented correctly. The check will also run all qualification tests to ensure that the release is fully qualified. This check will run on `ubuntu-latest` and on R version 4.1.3. Additionally, it will be run on the latest R release version on `windows-latest`, `macOS-latest`, and `ubuntu-latest`. +- `pkgdown`: + - Builds the [pkgdown site](https://gilead-biostats.github.io/gsm.core/) for the relevant package (`{gsm.core}` in this case). +- Qualification Report (`qualification-report`): + - Builds the qualification vignette as an attached artifact to the Pull Request. 
This should be reviewed by the Pull Request Owner for completeness and correctness to ensure that the artifact added to the release is correct. diff --git a/vignettes/Cookbook.Rmd b/vignettes/Cookbook.Rmd index 58d560f..ec20f17 100644 --- a/vignettes/Cookbook.Rmd +++ b/vignettes/Cookbook.Rmd @@ -1,5 +1,6 @@ --- title: "Cookbook" +description: "Sample code showing how to use the Good Statistical Monitoring {gsm} suite of packages using sample data from {gsm.core}." output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Cookbook} @@ -8,7 +9,7 @@ vignette: > --- ```{r setup, include = FALSE} -library(gsm) +library(gsm.core) knitr::opts_chunk$set( collapse = TRUE, comment = "#>" @@ -17,7 +18,7 @@ knitr::opts_chunk$set( # Introduction -This vignette contains sample code showing how to use the Good Statistical Monitoring `{gsm}` package using sample data from [`{clindata}`](https://github.com/Gilead-BioStats/clindata). For more information on the `{gsm}` package see the [package homepage](https://gilead-biostats.github.io/gsm/). +This vignette contains sample code showing how to use the Good Statistical Monitoring `{gsm}` suite of packages using sample data from `{gsm.core}`. For more information on the `{gsm}` suite of packages see the [package homepage](https://gilead-biostats.github.io/gsm.core/). 
# Setup and Installation @@ -27,48 +28,56 @@ Run the following: ## Install devtools install.packages('devtools') -## Install and load sample raw data -devtools::install_github("Gilead-BioStats/clindata", ref = "main") -library(clindata) - ## Install and load gsm -devtools::install_github("Gilead-BioStats/gsm", ref = "main") -library(gsm) +devtools::install_github("Gilead-BioStats/gsm.core", ref = "main") +library(gsm.core) + +## Install and load gsm.mapping +devtools::install_github("Gilead-BioStats/gsm.mapping", ref = "main") +library(gsm.mapping) + +## Install and load gsm.kri +devtools::install_github("Gilead-BioStats/gsm.kri", ref = "main") +library(gsm.kri) + +## Install and load gsm.reporting +devtools::install_github("Gilead-BioStats/gsm.reporting", ref = "main") +library(gsm.reporting) ``` # Example 1 - Adverse Events Metric - Scripted -This example uses the standard {gsm} analysis workflows to creates site-level Adverse Event scripts. See the [Data Analysis Vignette](https://gilead-biostats.github.io/gsm/articles/DataAnalysis.html) for more detail. +This example uses the standard {gsm} analysis workflows to create site-level Adverse Event metrics. See the [Data Analysis Vignette](https://gilead-biostats.github.io/gsm.core/articles/DataAnalysis.html) for more detail. - **Example 1.1** calculates the Site-level AE rates. - **Example 1.2** adds a filter to include only Serious Adverse Events (SAEs) and implements pipes to run through the workflow. - **Example 1.3** generates bar charts showing SAE rates and z-scores by study. - **Example 1.4** generates a scatter plot with confidence bound for SAE rates. 
-```{r file = system.file("examples", "1_AdverseEventKRI.R", package = "gsm"), eval = FALSE, include = TRUE} +```{r file = system.file("examples", "1_AdverseEventKRI.R", package = "gsm.core"), eval = FALSE, include = TRUE} ``` # Example 2 - Adverse Events Metrics - Workflow -This examples introduces YAML workflows to re-generate the same results as in **Example 1** via a reusable pipeline. See the [Data Model Vignette](https://gilead-biostats.github.io/gsm/articles/DataModel.html) for more detail. +This example introduces YAML workflows to re-generate the same results as in **Example 1** via a reusable pipeline. See the [Data Model Vignette](https://gilead-biostats.github.io/gsm.core/articles/DataModel.html) for more detail. - **Example 2.1** runs the AE KRI workflow. - **Example 2.2** updates the metadata to run country-level metrics. - **Example 2.3** adds a filtering step to the workflow to generate the SAE metric. -```{r file = system.file("examples", "2_AdverseEventWorkflow.R", package = "gsm"), eval = FALSE, include = TRUE} +```{r file = system.file("examples", "2_AdverseEventWorkflow.R", package = "gsm.core"), eval = FALSE, include = TRUE} ``` # Example 3 - Study-Level Reporting Workflows -This example extends the previous examples to generate charts and reports for multiple KRIs. See the [Data Reporting Vignette](https://gilead-biostats.github.io/gsm/articles/DataReporting.html) for more detail. +This example extends the previous examples to generate charts and reports for multiple KRIs. See the [Data Reporting Vignette](https://gilead-biostats.github.io/gsm.reporting/articles/DataReporting.html) for more detail. - **Example 3.1** steps through several workflows to generate a report for all 12 standard site-level KRIs. - **Example 3.2** automates data ingestion using `Ingest()` and `CombineSpecs()`. - **Example 3.3** generates a report incorporating multiple timepoints using the sample `reporting` data saved as part of {gsm}. 
-```{r file = system.file("examples", "3_ReportingWorkflow.R", package = "gsm"), eval = FALSE, include = TRUE} +```{r file = system.file("examples", "3_ReportingWorkflow.R", package = "gsm.core"), eval = FALSE, include = TRUE} ``` diff --git a/vignettes/DataAnalysis.Rmd b/vignettes/DataAnalysis.Rmd new file mode 100644 index 0000000..c3bda73 --- /dev/null +++ b/vignettes/DataAnalysis.Rmd @@ -0,0 +1,223 @@ +--- +title: "Step-by-Step Analysis Workflow" +description: "This vignette walks users through the mechanics of the functions that produce all of the Analysis workflow output within the `{gsm.core}` package." +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Step-by-Step Analysis Workflow} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) + +library(gsm.core) +library(gt) +library(DT) +``` + +# Introduction + +This vignette walks users through the mechanics of the functions that produce all of the Analysis workflow output within the `{gsm.core}` package. The suite of `{gsm}` packages leverages Key Risk Indicators (KRIs) and thresholds to conduct study-level and site-level Risk Based Monitoring for clinical trials. + +These functions provide data frames, visualizations, and metadata to be used in reporting and error checking at clinical sites. The image below illustrates the supporting functions that feed into the yaml workflow that is specified in each analysis workflow. + +![](data_analysis.png){width="100%"} + +All of these functions will run automatically and sequentially when a user calls upon the `RunWorkflow()` function with a specified yaml file for KRI metrics found in the `workflow/2_metrics` directory of the [`{gsm.kri}`](https://github.com/Gilead-BioStats/gsm.kri) package. + +Each of these individual functions can also be run independently outside of a specified yaml workflow. 
+ +For the purposes of this documentation, we will evaluate the input(s) and output(s) of each individual function for a specific KRI to show the stepwise progression of how a yaml workflow is set up to handle and process data. + +------------------------------------------------------------------------ + +## Case Study - Step-by-Step Adverse Event KRI + +We will use sample clinical data simulated with the [`{gsm.datasim}`](https://github.com/Gilead-BioStats/gsm.datasim) package to run the Adverse Events (AE) Assessment, i.e., `AE_Assess()`, using the normal approximation method. + +Additional statistical methods and supporting functions are explored in [Appendix 1](#appendix-1). + +### 1. Create `dfInput` + +Start by creating `dfInput` using sample rawplus data created with `{gsm.datasim}`. Note that `Input_Rate()` requires three specific clinical datasets: a subject-level demographics/exposure dataset (`dfSubjects`), a domain-level dataset (`dfNumerator`) that records every adverse event per subject, and a dataset used to derive the denominator (`dfDenominator`; in this example, the same subject-level dataset, which contains time on study). + +Since `Input_Rate()` is a generalized function, it is also required that you specify the relevant column names for the Subject (`strSubjectCol`), Group (`strGroupCol`) and optionally the Denominator (`strDenominatorCol`) and Numerator (`strNumeratorCol`) when it is not simply "Denominator" or "Numerator", respectively. + +Finally, the method for calculating the Numerator and Denominator is specified in `strNumeratorMethod` and `strDenominatorMethod` as either "Count" or "Sum". If the method is "Count", the function simply counts the number of rows in the provided data frame. If the method is "Sum", the function takes the sum of the values in the specified column (`strNumeratorCol` or `strDenominatorCol`). 
+ + +```{r include = TRUE} +dfInput <- Input_Rate( + dfSubjects = gsm.core::lSource$Raw_SUBJ, + dfNumerator = gsm.core::lSource$Raw_AE, + dfDenominator = gsm.core::lSource$Raw_SUBJ, + strSubjectCol = "subjid", + strGroupCol = "siteid", + strNumeratorMethod = "Count", + strDenominatorMethod = "Sum", + strDenominatorCol = "timeonstudy" +) +``` + +The data frame `dfInput` for an AE assessment will be created by running `Input_Rate()` and will have one record per subject, with the following columns: + +- `SubjectID`: Subject Identifier +- `GroupID`: Group Identifier +- `GroupLevel`: Type of Group specified in `GroupID` (Country, Site) +- `Numerator`: Total Number of Event(s) of Interest (in this example, the number of AEs reported; per subject) +- `Denominator`: Total Time on Study (measured in days; per subject) +- `Metric`: Rate of Event Incidence (calculated as `Numerator`/`Denominator`; per subject) + +```{r echo = FALSE} +datatable(dfInput) %>% + formatRound(columns = "Metric", digits = 3) +``` + +------------------------------------------------------------------------ + +### 2. Create `dfTransformed` + +The data frame `dfTransformed` is derived from `dfInput` using a `Transform()` function. In our example, the analysis pipeline pulls in `Transform_Rate()` since the default metric for AEs is the number of AEs reported over the course of treatment per site, i.e., a rate. + +```{r include = TRUE} +dfTransformed <- Transform_Rate(dfInput) +``` + +The resulting `dfTransformed` data frame will contain site-level transformed data, including KRI calculation. 
Using our example AE data, `dfTransformed` contains the following columns: + +- `GroupID`: Group Identifier (default is Site ID) +- `GroupLevel`: Type of Group specified in `GroupID` (Country, Site) +- `Numerator`: Cumulative Number of Event(s) of Interest (in this example, number of AEs reported across subjects) +- `Denominator`: Cumulative Time on Treatment (in days, across subjects) +- `Metric`: Rate of Event(s) of Interest (in this example, number of AEs reported over the course of treatment in days) + +```{r, echo = FALSE} +datatable(dfTransformed) %>% +formatRound(columns = "Metric", digits = 3) +``` + +------------------------------------------------------------------------ + +### 3. Create `dfAnalyzed` + +The data frame `dfAnalyzed` is derived from `dfTransformed` using an `Analyze()` function, which incorporates a specific statistical model. The resulting `dfAnalyzed` data frame will contain site-level analysis results data. The normal approximation method is the default statistical model for AE data, so the analysis pipeline automatically runs `Analyze_NormalApprox()`. + +```{r include = TRUE} +dfAnalyzed <- Analyze_NormalApprox(dfTransformed) +``` + +Using our example AE data, `dfAnalyzed` contains the following columns: + +- `GroupID`: Group Identifier (default is Site ID) +- `GroupLevel`: Type of Group specified in `GroupID` (Country, Site) +- `Numerator`: Cumulative Number of Event(s) of Interest (in this example, number of AEs reported across subjects); Carried from `dfTransformed`. +- `Denominator`: Cumulative Time on Treatment (in days, across subjects); Carried from `dfTransformed`. +- `Metric`: Rate of Event(s) of Interest (in this example, number of AEs reported over the course of treatment in days); Carried from `dfTransformed`. +- `OverallMetric`: Aggregate metric for the group that is being assessed. ( sum(Numerator) / sum(Denominator) ). 
+- `Factor`: Calculated over-dispersion adjustment factor (mean of the z-score sum of squares calculated in the analysis functions). +- `Score`: Calculated Residual (per site). + +```{r, echo = FALSE} +datatable(dfAnalyzed) %>% + formatRound(columns = c("Metric", "OverallMetric", "Factor", "Score"), digits = 3) +``` + +------------------------------------------------------------------------ + +### 4. Create `dfFlagged` + +The data frame `dfFlagged` is derived from `dfAnalyzed` using the `Flag()` function. The resulting `dfFlagged` data frame will contain site-level analysis results data with flagging incorporated based on a pre-specified statistical threshold to highlight possible outliers. + +```{r include = TRUE} +dfFlagged <- Flag(dfAnalyzed, vThreshold = c(-3, -2, 2, 3)) +``` + +The default flagging function for the normal approximation method is `Flag()` and the default threshold is (-3, -2, 2, 3). Using our example AE data, `dfFlagged` contains the following columns: + +- `GroupID`: Group Identifier (default is Site ID) +- `GroupLevel`: Type of Group specified in `GroupID` (Country, Site) +- `Numerator`: Cumulative Number of Event(s) of Interest (in this example, number of AEs reported across subjects); Carried from `dfAnalyzed` +- `Denominator`: Cumulative Time on Treatment (in days, across subjects); Carried from `dfAnalyzed` +- `Metric`: Rate of Event(s) of Interest (in this example, number of AEs reported over the course of treatment in days); Carried from `dfAnalyzed` +- `OverallMetric`: Aggregate metric for the group that is being assessed. ( sum(Numerator) / sum(Denominator) ). +- `Factor`: Calculated over-dispersion adjustment factor (mean of the z-score sum of squares calculated in the analysis functions); Carried from `dfAnalyzed`. 
+- `Score`: Calculated Residual (per site); Carried from `dfAnalyzed` +- `Flag`: Flag Indicating Possible Statistical Outliers; Valid values for this variable include -2, -1, 0, 1, and 2, which determine the "extremeness" of the outlier. -2 and 2 represent more extreme outliers, -1 and 1 represent less extreme outliers, and 0 represents a non-outlier. + +```{r, echo = FALSE} +datatable(dfFlagged) +``` + +------------------------------------------------------------------------ + +### 5. Create `dfSummary` + +The data frame `dfSummary` is derived from `dfFlagged` using the `Summarize()` function. The resulting `dfSummary` data frame will contain the most relevant columns from `dfFlagged` with data sorted in a meaningful way to provide a concise overview of the assessment. Flagged sites will sort earlier than non-flagged sites, with the more "extreme" outliers displayed first. The columns in `dfSummary` include: + +- `GroupID`: Group Identifier (default is Site ID) +- `GroupLevel`: Type of Group specified in `GroupID` (Country, Site) +- `Numerator`: Cumulative Number of Event(s) of Interest (in this example, number of AEs reported across subjects); Carried from `dfAnalyzed` +- `Denominator`: Cumulative Time on Treatment (in days, across subjects); Carried from `dfAnalyzed` +- `Metric`: Rate of Event(s) of Interest (in this example, number of AEs reported over the course of treatment in days) +- `Score`: Calculated Residual (per site) +- `Flag`: Flag Indicating Possible Statistical Outliers; Valid values for this variable include -2, -1, 0, 1, and 2, which determine the "extremeness" of the outlier. -2 and 2 represent more extreme outliers, -1 and 1 represent less extreme outliers, and 0 represents a non-outlier. 
+ +```{r include = TRUE} +dfSummary <- Summarize(dfFlagged) +``` + +```{r, echo = FALSE} +datatable(dfSummary[-1,]) +``` + +------------------------------------------------------------------------ + +# Recap - Normal Approximation Adverse Event KRI + +- `dfInput` created from the raw subject and adverse event data using `Input_Rate()` +- `dfTransformed` created from `dfInput` using `Transform_Rate()` +- `dfAnalyzed` created from `dfTransformed` using `Analyze_NormalApprox()` +- `dfFlagged` created from `dfAnalyzed` using `Flag()` +- `dfSummary` created from `dfFlagged` using `Summarize()` + +------------------------------------------------------------------------ + +# Appendix 1 - Supporting Functions {#appendix-1} + +The following sections include various examples of supporting functions and statistical models that can be employed in the Analysis workflow. Please note that this is **not** an exhaustive list, but includes some of the most commonly called upon functions. + +### Mapping Functions + +- `RunQuery()`: Run a SQL query to create new data.frames with filtering and column name specifications. +- `Input_Rate()`: Calculate a subject level rate from raw numerator and denominator data + +### Transform Functions + +- `Transform_Rate()`: Calculates cumulative rate of Event(s) of Interest per site +- `Transform_Count()`: Calculates cumulative number of Event(s) of Interest per site + +### Analyze Functions + +- `Analyze_NormalApprox()`: Uses funnel plot method with normal approximation to create analysis results for percentage/rate. 
+- `Analyze_Fisher()`: Uses Fisher's Exact Test to determine if there are non-random associations between a site and a given KRI +- `Analyze_Identity()`: Used in the data pipeline between `Transform()` and `Flag()` functions to rename KRI and Score columns +- `Analyze_Poisson()`: Uses a Poisson model to describe the distribution of events in the overall site population, i.e., determine how many times an event is likely to occur at a site over a specified treatment period + +### Flag Functions + +- `Flag()`: Default flagging function for all assessments +- `Flag_NormalApprox()`: Deprecated flagging function when `Analyze_NormalApprox()` is used for an assessment. +- `Flag_Poisson()`: Deprecated flagging function when `Analyze_Poisson()` is used for an assessment + + + +### What Statistical Models Are Available For Each Assessment? + +- By default, all yaml workflow assessments specified in the `inst/workflow/` directory of the `{gsm.kri}` package use the [normal approximation](https://gilead-biostats.github.io/gsm.core/articles/KRIMethod.html#the-normal-approximation-method) method. +- Optionally, other statistical methods include: [**Poisson**](https://gilead-biostats.github.io/gsm.core/articles/KRIMethod.html#the-poisson-regression-method), [**Fisher's Exact**](https://gilead-biostats.github.io/gsm.core/articles/KRIMethod.html#the-fishers-exact-method), and [**Identity**](https://gilead-biostats.github.io/gsm.core/articles/KRIMethod.html#the-identity-method). + +![](data_analysis_combined.png){width="100%"} diff --git a/vignettes/DataModel.Rmd b/vignettes/DataModel.Rmd index 58572be..233cd34 100644 --- a/vignettes/DataModel.Rmd +++ b/vignettes/DataModel.Rmd @@ -1,5 +1,6 @@ --- title: "Data Model" +description: "A vignette detailing the data model used in the gsm pipeline." 
output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Data Model} @@ -7,13 +8,369 @@ vignette: > %\VignetteEncoding{UTF-8} --- -```{r, include = FALSE} +```{r setup, include = FALSE} +library(gsm.core) +library(gt) +library(dplyr) + knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` -```{r setup} -library(openrbqm) +# Introduction + +The `{gsm}` suite of packages provides a standardized data pipeline for conducting study-level Risk Based Quality Management (RBQM) for clinical trials. There are four main types of data used in the `{gsm}` suite of packages: + +- **Raw Data** - Clinical and operational data from study databases +- **Mapped Data** - Data that has been transformed and standardized for analysis +- **Analysis Data** - Data that has been analyzed to calculate Key Risk Indicators (KRIs) +- **Reporting Data** - Data that has been summarized and formatted for reporting + +This vignette provides a high-level overview of how each type of data is used, and includes detailed data specifications as appendices. + +# Data Model Overview + +In general, the `{gsm}` suite of packages is designed to be flexible and customizable, allowing users to build custom data pipelines that support many types of raw study data. As shown below, raw clinical data is transformed into mapped data, which is then analyzed to calculate desired metrics. The analysis data is then combined and formatted for reporting with additional raw data, including CTMS data and `{gsm.mapping}` workflow data, which provides relevant metadata for reports. + +![](data_model_simple.png){width="100%"} + +# Raw and Mapped Data + +The `{gsm}` suite of packages is designed to work with a wide variety of clinical data sources. The raw data used in the analysis pipeline is typically sourced from clinical trial databases and is transformed into mapped data using simple transformations. Mapped data is then used as input for the analysis pipeline. 
+ +There is not a single data standard for raw or mapped data in `{gsm.mapping}`. The only requirement is that the mapped data is compatible with the analytics pipeline. Data Mapping transformations can be done using multiple methods including custom R scripts (e.g., with `dplyr`), SQL queries, or using `gsm.mapping` workflows (e.g. the `system.file("workflow/1_mapping/AE.yaml", package = "gsm.mapping")` file). Examples of these methods can be found in `vignette("Cookbook")`. + +# Analysis Data + +In `{gsm.kri}`, analysis data is used to capture key metrics associated with the conduct of a clinical trial. As described in the `{gsm.kri}` readme, 12 standard Key Risk Indicator (KRI) metrics are included in the package along with automated workflows that allow them to be run for all sites or countries in a study. Examples of KRIs include the rate of adverse events or amount of missing data at a site or across sites. Defining and deploying KRIs during a study allows teams to continually monitor risks to the integrity of the trial and take corrective actions accordingly. + +![](data_analysis.png){width="100%"} + +The image above provides an overview of the default KRI analysis pipeline. The pipeline is a standardized five-step process for assessing data issues by going from participant-level input data to a standardized site-level summary of model results. The functions used in each step of the data pipeline along with the input and output datasets are described in more detail below. + +1. `dfInput`: Input data; Cross-domain participant-level input data with all needed data for KRI derivation. Created by the `Input_Rate()` function using `mapped` data as input. +2. `dfTransformed`: Transformed data; Site-level transformed data including KRI calculation. Created by `Transform_*()` functions using `dfInput` as input. +3. `dfAnalyzed`: Analyzed data; Site-level analysis results data. Created by `Analyze_*()` functions using `dfTransformed` as input. +4. 
`dfFlagged`: Flagged data; Site-level analysis results with flags added to indicate potential statistical outliers. Created by passing numeric thresholds to a `Flag_*()` function using `dfAnalyzed` as input. +5. `dfSummary`: Summary data; Standardized subset of the flagged data. This summary data has the same structure for all assessments and always includes both KRI and Flag values so that a user can easily look at trends for any given site across multiple assessments. Created using the `Summarize()` function using `dfFlagged` as input. + +The data requirements for each component of the analysis pipeline are rigid; See [Appendix 1](#Appendix-1-Data-Model) for full specifications. + +## Analysis Workflows +Since there are rigid data requirements for each component of the analysis data model, the analysis workflow is largely standardized. There are two main approaches to running the analysis workflow: + +1. **Scripted Analysis**: Run each step of the analysis pipeline individually using the functions provided in the `{gsm}` suite of packages. This approach is useful for understanding the data requirements and for debugging. See Example 1 in `vignette("Cookbook")` for an example of this approach. +2. **Workflow Analysis**: Run the analysis pipeline using a YAML workflow file. This approach is useful for running the same analysis on multiple studies or for automating the analysis process. See Example 2 in `vignette("Cookbook")` for an example of this approach. + +Note that each step in these workflows can be customized based on the requirements for a specific KRI. The graphic below shows four such workflows. + +![](data_analysis_combined.png){width="100%"} + +More details about analysis data pipelines can be found in `vignette("DataAnalysis")`. 
+ +# Reporting Data + +A rigid Reporting Data framework is provided in `{gsm.reporting}` to allow for standardized reporting, visualization and meta-analysis that compare risk profiles across timepoints, and even across multiple studies. The Reporting Data sets used in `{gsm.reporting}` and `{gsm.kri}` are: + +1. `Reporting_Results`: Summary data; Standardized subset of the flagged data. This summary data has the same structure for all assessments and always includes both KRI and Flag values so that a user can easily look at trends for any given site across multiple assessments. Created using the `Summarize()` function in the analytics pipeline, followed by the `BindResults()` function to add columns necessary for reporting and stack metrics and snapshots into a single data.frame. +2. `Reporting_Bounds`: Bounded data; A data.frame containing predicted boundary values with upper and lower bounds across the range of observed values. Created with the `MakeBounds()` function. +3. `Reporting_Groups`: Grouped data; Long data.frame of summarized group CTMS data with site, study, and country level counts and metrics. Constructed by binding data.frames created with `MakeLongMeta()`. +4. `Reporting_Metrics`: Metric metadata; Metric-specific metadata for use in charts and reporting. Created by passing an `lWorkflow` object to the `MakeMetric()` function. + +Similar to Analysis Workflows, reporting data pipelines can be run as R scripts or as YAML workflows. Example 3 in the cookbook vignette shows how to populate the Reporting Data tables using output from the Analysis Workflows and other study data sources. The Reporting Deep Dive Vignette provides more details on the Reporting Data model. 
+ +# Appendix 1 - Data Model + +# Overview +![](data_model_detailed.png){width="100%"} + +# Analytics data model + +The KRI analytics pipeline is a standardized process for **Analyzing** data issues by going from participant-level `input` data to a standardized site-level `summary` of model results. The data sets used in each step of the data pipeline are described in detail below. When using a metric workflow YAML to create these tables, all data tables are contained in a list, which we call `lAnalysis`. This list is then fed into the reporting data pipeline. + +## Analysis Data Tables + +### `Analysis_Input` + - Function(s) used to create table: + - `gsm.core::Input_Rate()` + - Inputs: + - `Analysis_Subjects` + - `Analysis_Numerator` + - `Analysis_Denominator` + - Usage: The base data.frame for all Analysis workflows. Feeds into the `Transform_*()` functions. + - Structure: + + | Table | Column Name | Description | Type | Optional | + |----------|--------------|--------------------------------------|----------|--| + | Analysis_Input | SubjectID | The subject ID | Character| | + | Analysis_Input | GroupID | The group ID for the metric | Character| | | + | Analysis_Input | GroupLevel | The group type for the metric (e.g. "Site") | Character| | + | Analysis_Input | Numerator | The calculated numerator value | Numeric | | + | Analysis_Input | Denominator | The calculated denominator value | Numeric | | + | Analysis_Input | Metric | The calculated rate/metric value | Numeric | | + + +### `Analysis_Transformed` + - Function(s) used to create table: + - `gsm.core::Transform_Rate()` + - `gsm.core::Transform_Count()` + - Inputs: `Analysis_Input` + - Usage: Convert from input data format to needed format to derive KRI for an Assessment via the `Analyze_*()` functions. 
+ - Structure: + + | Table | Column Name | Description | Type | Optional | + |----------|--------------|--------------------------------------|----------|--| + | Analysis_Transformed | GroupID | The group ID for the metric | Character| | | + | Analysis_Transformed | GroupLevel | The group type for the metric (e.g. "Site") | Character| | + | Analysis_Transformed | Numerator | The calculated numerator value | Numeric | | + | Analysis_Transformed | Denominator | The calculated denominator value | Numeric | | + | Analysis_Transformed | Metric | The calculated rate/metric value | Numeric | | + +### `Analysis_Analyzed` + - Function(s) used to create table: + - `gsm.core::Analyze_Fisher()` + - `gsm.core::Analyze_Identity()` + - `gsm.core::Analyze_NormalApprox()` + - `gsm.core::Analyze_Poisson()` + - Inputs: `Analysis_Transformed` + - Usage: Prepare the data for `Flag_*()` by performing the specified test on the metric provided. + - Structure: + + | Table | Column Name | Description | Type | Optional | + |----------|--------------|--------------------------------------|----------|--| + | Analysis_Analyzed | GroupID | The group ID for the metric | Character| | | + | Analysis_Analyzed | GroupLevel | The group type for the metric (e.g. "Site") | Character| | + | Analysis_Analyzed | Numerator | The calculated numerator value | Numeric | | + | Analysis_Analyzed | Denominator | The calculated denominator value | Numeric | | + | Analysis_Analyzed | Metric | The calculated rate/metric value | Numeric | | + | Analysis_Analyzed | Score | The Statistical Score | Numeric | | + | Analysis_Analyzed | Overall Metric | | Numeric |* | + | Analysis_Analyzed | Factor | | Numeric |* | + | Analysis_Analyzed | Predicted Count | | Numeric |* | + + +### `Analysis_Flagged` + - Function(s) used to create table: + - `gsm.core::Flag()` + - Inputs: `Analysis_Analyzed` + - Usage: Flag a group-level metric to be summarized via `gsm.core::Summarize()` and used for reporting. 
+ - Structure: + + | Table | Column Name | Description | Type | Optional | + |----------|--------------|--------------------------------------|----------|--| + | Analysis_Flagged | GroupID | The group ID for the metric | Character| | | + | Analysis_Flagged | GroupLevel | The group type for the metric (e.g. "Site") | Character| | + | Analysis_Flagged | Numerator | The calculated numerator value | Numeric | | + | Analysis_Flagged | Denominator | The calculated denominator value | Numeric | | + | Analysis_Flagged | Metric | The calculated rate/metric value | Numeric | | + | Analysis_Flagged | Score | The Statistical Score | Numeric | | + | Analysis_Flagged | Flag | The ordinal Flag to be applied | Numeric | | + | Analysis_Flagged | Overall Metric | | Numeric |* | + | Analysis_Flagged | Factor | | Numeric |* | + | Analysis_Flagged | Predicted Count | | Numeric |* | + +### `Analysis_Summary` + - Function(s) used to create table: + - `gsm.core::Summarize()` + - Inputs: `Analysis_Flagged` + - Usage: Summarize KRI at the group level for reporting. + - Structure: + + | Table | Column Name | Description | Type | Optional | + |----------|--------------|--------------------------------------|----------|--| + | Analysis_Summary | GroupID | The group ID for the metric | Character| | | + | Analysis_Summary | GroupLevel | The group type for the metric (e.g. "Site") | Character| | + | Analysis_Summary | Numerator | The calculated numerator value | Numeric | | + | Analysis_Summary | Denominator | The calculated denominator value | Numeric | | + | Analysis_Summary | Metric | The calculated rate/metric value | Numeric | | + + +# Overview of Reporting data model + +## Reporting Data Tables + +### `Reporting_Results` + - Function(s) used to create table: + - `gsm.reporting::BindResults()` + - Inputs: `lAnalysis`, `strStudyID`, `dSnapshotDate` + - Workflow used to create table: `3_reporting/Results.yaml` in `{gsm.reporting}` + - Usage: Summarize KRI at the group level for reporting. 
+ - Structure: + + | Table | Column Name | Description | Type | Optional | + |----------|--------------|--------------------------------------|----------|--| + | Reporting_Results | GroupID | The group ID for the metric | Character| | + | Reporting_Results | GroupLevel | The group type for the metric (e.g. "Site") | Character| | + | Reporting_Results | Numerator | The calculated numerator value | Numeric | | + | Reporting_Results | Denominator | The calculated denominator value | Numeric | | + | Reporting_Results | Metric | The calculated rate/metric value | Numeric | | + | Reporting_Results | Score | The calculated metric score | Numeric | | + | Reporting_Results | Flag | The calculated flag | Numeric | | + | Reporting_Results | MetricID | The Metric ID | Character| * | + | Reporting_Results | StudyID | The Study ID | Character| * | + | Reporting_Results | SnapshotDate | The Date of the snapshot | Date | * | + +### `Reporting_Bounds` + - Function(s) used to create table: + - `gsm.reporting::MakeBounds()` + - `gsm.core::Analyze_NormalApprox_PredictBounds()` (called within `gsm.reporting::MakeBounds()`) + - Inputs: `lAnalysis`, `strStudyID`, `dSnapshotDate` + - Workflow used to create table: `3_reporting/Bounds.yaml` in `{gsm.reporting}` + - Usage: Calculates predicted percentages/rates and upper- and lower-bounds across the full range of sample sizes/total exposure values for reporting.
+ - Structure: + + | Table | Column Name | Description | Type | Optional | + |----------|--------------|--------------------------------------|----------|--| + | Reporting_Bounds | Threshold | The number of standard deviations that the upper and lower bounds are based on | Numeric| | + | Reporting_Bounds | Denominator | The calculated denominator value | Numeric | | + | Reporting_Bounds | LogDenominator | The calculated log denominator value | Numeric | | + | Reporting_Bounds | Numerator | The calculated numerator value | Numeric | | + | Reporting_Bounds | Metric | The calculated rate/metric value | Numeric | | + | Reporting_Bounds | MetricID | The Metric ID | Character| | + | Reporting_Bounds | StudyID | The Study ID | Character| | + | Reporting_Bounds | SnapshotDate | The Date of the snapshot | Date | | + +### `Reporting_Groups` + - Function(s) used to create table: + - `gsm.reporting::MakeLongMeta()` + - `dplyr::bind_rows()` + - Inputs: CTMS site, study and country data + - Workflow used to create table: `3_reporting/Groups.yaml` in `{gsm.reporting}` + - Usage: Group-level metadata dictionary. + - Structure: Long data frame, with certain `Param` required for given `GroupLevel` + + +| Table | Column | Description |Type | Optional | +|-------------|-----------------------|-----------------------------------|----------|-----------| +| Reporting_Groups | GroupID | Unique Group ID | Character| | +| Reporting_Groups | GroupLevel | Group Level (e.g. Site, Country) | Character| | +| Reporting_Groups | Param | Parameter Name (e.g. "Status") | Character| | +| Reporting_Groups | Value | Parameter Value (e.g. "Active") | Character| | + +Expected `Param` by `GroupLevel` for use in gsm reporting. Users may add other `Param` values as needed.
+ +| GroupLevel | Param | Description |Value Type | +|--------------|----------------------|-----------------------------------|----------| +| Study | Status | Study Status | Character| +| Study | ParticipantCount | # of Enrolled Participants | Numeric | +| Study | SiteCount | # of Activated Sites | Numeric| +| Site | ParticipantCount | # of Enrolled Participants | Numeric | +| Site | Status | Site Status | Character | +| Site | InvestigatorFirstName | Investigator First name | Character | +| Site | InvestigatorLastName | Investigator Last name | Character | +| Site | City | City | Character| +| Site | State | State | Character | +| Site | Country | Country | Character | +| Country | EnrolledParticipants | # of Enrolled Participants | Numeric | + + +### `Reporting_Metrics` + - Function used to create table: `gsm.reporting::MakeMetric()` + - Inputs: `lWorkflows` - metadata for the corresponding kri(s) made with `gsm.core::MakeWorkflowList()` + - Workflow used to create table: `3_reporting/Metrics.yaml` in `{gsm.reporting}` + - Usage: Metadata used for charts and tables + - Structure: + + | Table | Column Name | Description | Type | Optional | + |----------|--------------|--------------------------------------|----------|--| -------------- | + | Reporting_Metrics| File | The YAML file for workflow | Character| | + | Reporting_Metrics| MetricID | ID for the Metric | Character| | + | Reporting_Metrics| Group | The group type for the metric (e.g. 
"Site") | Character| | + | Reporting_Metrics| Abbreviation | Abbreviation for the metric | Character| | + | Reporting_Metrics| Metric | Name of the metric | Character| | + | Reporting_Metrics| Numerator | Data source for the Numerator | Character| | + | Reporting_Metrics| Denominator | Data source for the Denominator | Character| | + | Reporting_Metrics| Model | Model used to calculate metric | Character| | + | Reporting_Metrics| Score | Type of Score reported | Character| | + +## Appendix 2 - Analysis Workflow Specifications + +Assessment workflow metadata objects are passed to the `lWorkflow` parameter in `RunWorkflow()` to define functions and parameters across multiple studies. + +The `lWorkflow` object is a named list of metadata and steps defining how each assessment should be run. By default, `gsm.core::MakeWorkflowList()` imports YAML specifications from `workflow/2_metrics` in `{gsm.kri}`. Each item in `lWorkflow` expects the following parameters in the `steps` section: + +- `workflow`: Array defining one or more functions to be executed as part of the workflow for a given assessment + - `workflow[]$meta`: specifies all of the metadata information for the KRI. + - `workflow[]$steps`: specifies all of the steps in the workflow. + - `workflow[]$steps$name`: name of the `{gsm}` function. 
+ - `workflow[]$steps$inputs`: specifies the required input data + - `workflow[]$steps$output`: specifies the output data from the workflow step, which can be used as an input in the next step in the workflow + - `workflow[]$steps$params`: specifies parameters to be passed to the function + +For example, the default workflow for the AE assessment (`system.file("workflow/2_metrics/kri0001.yaml", package = "gsm.kri")`) is shown below: + +```{yaml eval = FALSE} +meta: + Type: Analysis + ID: kri0001 + GroupLevel: Site + Abbreviation: AE + Metric: Adverse Event Rate + Numerator: Adverse Events + Denominator: Days on Study + Model: Normal Approximation + Score: Adjusted Z-Score + AnalysisType: rate + Threshold: -2,-1,2,3 + nMinDenominator: 30 +spec: + Mapped_AE: + subjid: + type: character + Mapped_SUBJ: + subjid: + type: character + invid: + type: character + timeonstudy: + type: integer +steps: + - output: vThreshold + name: gsm.core::ParseThreshold + params: + strThreshold: Threshold + - output: Analysis_Input + name: gsm.core::Input_Rate + params: + dfSubjects: Mapped_SUBJ + dfNumerator: Mapped_AE + dfDenominator: Mapped_SUBJ + strSubjectCol: subjid + strGroupCol: invid + strGroupLevel: GroupLevel + strNumeratorMethod: Count + strDenominatorMethod: Sum + strDenominatorCol: timeonstudy + - output: Analysis_Transformed + name: gsm.core::Transform_Rate + params: + dfInput: Analysis_Input + - output: Analysis_Analyzed + name: Analyze_NormalApprox + params: + dfTransformed: Analysis_Transformed + strType: AnalysisType + - output: Analysis_Flagged + name: gsm.core::Flag_NormalApprox + params: + dfAnalyzed: Analysis_Analyzed + vThreshold: vThreshold + - output: Analysis_Summary + name: gsm.core::Summarize + params: + dfFlagged: Analysis_Flagged + nMinDenominator: nMinDenominator + - output: lAnalysis + name: list + params: + ID: ID + Analysis_Input: Analysis_Input + Analysis_Transformed: Analysis_Transformed + Analysis_Analyzed: Analysis_Analyzed + Analysis_Flagged: 
Analysis_Flagged + Analysis_Summary: Analysis_Summary + + ``` diff --git a/vignettes/KRIMethod.Rmd b/vignettes/KRIMethod.Rmd new file mode 100644 index 0000000..ec1bd25 --- /dev/null +++ b/vignettes/KRIMethod.Rmd @@ -0,0 +1,223 @@ +--- +title: "KRI Method" +description: "This vignette outlines the statistical methods used to evaluate Key Risk Indicators (KRIs) in {gsm}." +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{KRI Method} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +library(gsm.core) +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +# Overview + +This vignette outlines the statistical methods used to evaluate Key Risk Indicators (KRIs) in the {gsm} suite of packages. KRIs are metrics that allow users to measure pre-defined risks and determine the level of observed risk to data quality and patient safety in a clinical trial. The {gsm} suite of packages implements a standardized data pipeline to facilitate KRI analysis. Other vignettes provide an overview of this framework ([1](Cookbook.html) [2](DataModel.html), [3](DataAnalysis.html), +[4](https://gilead-biostats.github.io/gsm.reporting/articles/DataReporting.html)), and the statistical methods for this process are described in detail below. + +`{gsm.core}` calculates KRIs by defining a numerator and a denominator for each metric. Then by default, `{gsm.core}` calculates z-scores using a normal approximation with adjustment for over-dispersion to assign risk levels. + +For KRIs that are percentages (binary outcome), the numerator is the # of events and the denominator is the # of total participants, and we then apply the normal approximation of the binomial distribution to determine a risk level. 
+ +For KRIs that are rates (count outcome), the numerator is the # of events and the denominator is the total participant exposure or study duration, and we then apply the normal approximation of the Poisson distribution to determine a risk level. + +Alternative statistical methods to calculate standardized scores are also available in `{gsm.core}`, including the Identity, Fisher and Poisson methods. More details are provided below. + +# Statistical Methods + +## 1. The Normal Approximation Method + +### Introduction + +This method applies the normal approximation of the binomial distribution to the binary outcome KRIs, or the normal approximation of the Poisson distribution to the rate outcome KRIs (the sample sizes or total exposure of the sites) to assess data quality and safety. The control limits based on the asymptotic normal approximation are constructed to serve as risk thresholds for identifying site-level risks. + +Reference: Zink, Richard C., Anastasia Dmitrienko, and Alex Dmitrienko. **Rethinking the clinically based thresholds of TransCelerate BioPharma for risk-based monitoring.** *Therapeutic Innovation & Regulatory Science* 52, no. 5 (2018): 560-571. + +### Methods + +#### Binary + +Consider the problem of monitoring KRIs with binary outcomes, such as protocol deviation or discontinuation from the study, across multiple sites in a clinical trial. Assume that there are $m$ sites with $n_i$ patients at the $i$ th site, $i = 1, 2, \dots, m$. Denote the total number of patients in the study by $n=\sum_{i=1}^m n_i$. Let $X_{ij}$ signify the outcome of interest for the $j$ th patient at the $i$ th site, where $X_{ij}=1$ indicates that an event has occurred and $X_{ij}=0$ indicates that an event has not occurred. Finally, let $p_i$ denote the site-level proportion at the $i$ th site. Monitoring tools focus on testing the null hypothesis of consistency of the true site-level proportion across multiple sites.
Specifically, the null hypothesis states that the site-level proportion of the binary outcome is constant across the sites, that is, $H_0: p_1 = \dots = p_m = p$, where $p$ is the common proportion. This common proportion can be estimated as $\hat{p} = \frac{1}{n}\sum_{i=1}^m\sum_{j=1}^{n_i}X_{ij}$. + +The control limits are computed using confidence limits based on an asymptotic normal approximation. A 95% +confidence interval is obtained if the significance level $\alpha=0.05$. Let $X_i=\sum_{j=1}^{n_i}X_{ij}$ represent the total number of events that occur and let $\hat{p}_i=X_i/n_i$ denote the estimated event rate at the $i$ th site. The asymptotic $100(1 - \alpha)\%$ confidence interval for $p_i$ is given by $$ +\hat{p}_i-z_{1-\alpha/2}\sqrt{\frac{\hat{p}_i(1-\hat{p}_i)}{n_i}} \leq p_i \leq \hat{p}_i+z_{1-\alpha/2}\sqrt{\frac{\hat{p}_i(1-\hat{p}_i)}{n_i}} +$$ where $z_{1-\alpha/2}$ is the $100(1-\alpha/2)$th percentile of the standard normal distribution. To construct the control limits for the observed event rate at this site, the estimated event rate is forced to be equal to the overall event rate $\hat{p}$. This means that the lower (l) and upper (u) asymptotic control limits for the $i$ th site are defined as $l_i=\hat{p}-z_{1-\alpha/2}\sqrt{\frac{\hat{p}(1-\hat{p})}{n_i}}$ and $u_i=\hat{p}+z_{1-\alpha/2}\sqrt{\frac{\hat{p}(1-\hat{p})}{n_i}}$, respectively. Asymptotic control limits may not be reliable in smaller clinical trials, so exact limits for an event rate may be preferable. + +#### Rate + +Assume that the number of events up to time $T$ follows a Poisson distribution with mean $\lambda T$, where $\lambda$ is the event rate for a given unit of time. For the $i$ th site with $X_i=\sum_{j=1}^{n_i}X_{ij}$ events and $T_i=\sum_{j=1}^{n_i}t_{ij}$ exposure, define the exposure-adjusted incidence rate (EAIR) as $\hat{\lambda}_i=\frac{X_i}{T_i}$. For all sites, define $X=\sum_{i=1}^{m}X_{i}$ and $T=\sum_{i=1}^{m}T_{i}$ with $\hat{\lambda}=\frac{X}{T}$.
Under a normal approximation, the $100(1 - \alpha)\%$ confidence interval for the $i$ th site is $$ +\hat{\lambda}_i-z_{1-\alpha/2}\sqrt{\frac{\hat{\lambda}_i}{T_i}} \leq \lambda_i \leq \hat{\lambda}_i+z_{1-\alpha/2}\sqrt{\frac{\hat{\lambda}_i}{T_i}} +$$. For these funnel plots accounting for exposure, the x-axis representing the site sample size ($n$) in the above examples is replaced by the total exposure time $T$. To develop a funnel plot, fix $\hat{\lambda}_i=\hat{\lambda}$, and vary $T$ from $\min(T_i)$ to $\max(T_i)$ to compute the control limits. As an area of future research, the work of Chan and Wang (2009) may suggest methods appropriate for computing an exact confidence interval for the EAIR. Finally, similar methods can be applied for a count-type endpoint $X_{ij}$, where $t_{ij}$ would denote the time on study for the $j$ th patient at the $i$ th site. + +### KRI Metric and Z-score + +The KRI metric along with a KRI score are created for each site to measure the level of observed risk to data quality and patient safety in a clinical trial. For scoring purposes, Z-scores from the normal approximation are calculated and defined as such: $z_i=\frac{y_i-\theta_0}{\sqrt{V(Y|\theta_0)}}$ for site $i$, where $y_i$ is the KRI metric calculated for site $i$, $\theta_0$ is the overall mean, and $\sqrt{V(Y|\theta_0)}$ is the standard deviation of the metric under the null hypothesis. + +For binary outcome, $\sqrt{V(Y|\theta_0)}=\sqrt{\frac{\hat{p}(1-\hat{p})}{n_i}}$. + +For rate outcome, $\sqrt{V(Y|\theta_0)}=\sqrt{\frac{\hat{\lambda}}{T_i}}$. + +### Over-dispersion adjustment + +The standard normal approximation method described above assumes the null distribution fully expresses the variability of the sites in-control, but in many situations this assumption will not hold. In the situation that there is a presence of greater variability than expected, the majority of the sites will fall outside the specified limits, leading to doubt about the appropriateness of the constructed limits.
+ +A way of handling this issue is to allow over-dispersion in the normal approximation. A multiplicative over-dispersion adjustment was implemented in our approach. + +Suppose a sample of $m$ units is assumed to be **in-control**; the over-dispersion factor $\phi$ can then be estimated as the mean of the squared z-scores, i.e., $\hat\phi = \frac{1}{m}\sum_{i=1}^m z_i^2$. +For binary outcome, the over-dispersion adjusted variance is $V'(Y_i|\phi, p)=\phi\frac{{p}(1-p)}{n_i}$. +For rate outcome, the over-dispersion adjusted variance is $V'(Y_i|\phi, \lambda)=\phi\frac{\lambda}{T_i}$. +Therefore, after the over-dispersion adjustment, the adjusted z-scores for site $i$ are $z_i = \frac{\hat{p}_i - \hat{p}}{\sqrt{\hat\phi \frac{{\hat{p}}(1-\hat{p})}{n_i}}}$, $z_i = \frac{\hat{\lambda}_i - \hat{\lambda}}{\sqrt{\hat\phi \frac{\hat\lambda}{T_i}}}$, respectively. + +Reference: Spiegelhalter, David J. **Funnel plots for comparing institutional performance.** *Statistics in medicine* 24.8 (2005): 1185-1202. + +### Estimate and Score + +The function `Analyze_NormalApprox()` in `{gsm.core}` calculates an adjusted z-score for each site as discussed above. The adjusted z-scores are then used as a scoring metric in `{gsm.core}` to flag possible outliers using the thresholds discussed below. + +### Threshold + +By default, sites with adjusted z-score exceeding $\pm 2$ or $\pm 3$ from the normal approximation analysis are flagged as amber or red, respectively. The thresholds are set at common choices corresponding to 95.4% and 99.7% of the data around the mean in a standard normal distribution. However, they are fully configurable in the package and can be customized and specified in the `{gsm.core}` functions. + +### Special Situations + +1. Results are not interpretable or it is not appropriate to apply the asymptotic method: We don't want to flag in certain situations when results are not interpretable or when it is not appropriate to apply the asymptotic method due to the small sample sizes.
The default threshold for minimum denominator requirement is 30 days exposure or 3 patients at the site level. + +### Recommendation + +Normal approximation method can be used in all scenarios with binary or rate KRIs. + +## 2. The Identity Method + +Identity method simply uses the count of event in the numerator of the KRI metric itself as the score. The thresholds for monitoring site risk are set based on the actual counts. + +## 3. The Fisher's Exact Method + +### Introduction + +For the binary outcome KRIs, an optional method in `{gsm.core}` is implemented with Fisher's exact test. + +Fisher's exact test is a statistical significance test used in the analysis of contingency tables when we have nominal variables and want to find out if proportions for one variable are different among values of the other variables. + +In contrast to large-sample based asymptotic statistics which rely on approximation, Fisher's exact test can be applied when sample sizes are small. + +The function `Analyze_Fisher` in `{gsm.core}` utilizes `stats::fisher.test` to generate an estimate of odds ratio as well as p-value using the Fisher's exact test with site-level count data. For each site, Fisher's exact test is conducted by comparing to all other sites combined in a 2×2 contingency table. The p-values are then used as a scoring metric in `{gsm.core}` to flag possible outliers. The default in `stats::fisher.test` uses a two-sided test (equivalent to testing the null of OR = 1) and does not compute p-values by Monte Carlo simulation unless `simulate.p.value = TRUE`. Sites with p-values less than 0.05 from the Fisher's exact test analysis are flagged by default. The significance level was set at a common choice. 
+ +### Methods + +For example, in a $2 \times 2$ contingency table comparing a particular site to all other sites combined, the two rows displaying the binary outcome are considered repeated Bernoulli random samples with same probability $p=0.5$ of success or failure under the null. Given a $2 \times 2$ contingency table, + +```{r echo = FALSE, results = 'asis'} +library(gt) +table1<-data.frame(Site1=c("a","b"), RestSites=c("c","d")) +rownames(table1)<-c("Yes", "No") +gt::gt(table1) +``` + +Fisher (1922) showed that conditional on the margins of the table, $a$ is distributed as a hypergeometric distribution with $a+c$ draws from a population with $a+b$ successes and $c+d$ failures. Let $n=a+b+c+d$, the probability of obtaining such set of values is given by: + +$$ +p=\frac{{{a+b} \choose a} {{c+d} \choose c}}{{n \choose {a+c}}}=\frac{{{a+b} \choose b} {{c+d} \choose d}}{{n \choose {b+d}}}=\frac{(a+b)!(c+d)!(a+c)!(b+d)!}{a! b! c! d! n!}. +$$ + +### Estimate and Score + +The function `Analyze_Fisher()` in `{gsm.core}` utilizes `stats::fisher.test()` to generate an estimate of odds ratio as well as p-value using the Fisher's exact test with site-level count data. For each site, Fisher's exact test is conducted by comparing to all other sites combined in a $2 \times 2$ contingency table. The p-values are then used as a scoring metric in `{gsm.core}` to flag possible outliers using the thresholds discussed below. The default in `stats::fisher.test()` uses two-sided test (equivalent to testing the null: OR=1) and not to compute p-values by Monte Carlo simulation unless `simulate.p.value = TRUE` is specified. + +### Threshold + +By default, sites with p-values less than 0.05 or 0.01 from the Fisher's exact test analysis are flagged as amber or red, respectively. The thresholds are set based on empirical p-value approach, where we use the distribution of the p-values to find the best separation of the data to identify sites at risk. 
The default thresholds are set at common choices of significance levels. However, they are fully configurable in the package and can be customized and specified in the `{gsm.core}` functions. + + +### The Fisher's exact test assumptions + +1. The row totals and the column totals are both fixed by design. + +2. The samples are mutually exclusive and mutually independent. + +The assumptions can be assessed by the knowledge of data collected. No assumption check is necessary. + +### Special situations + +1. Functionally: where we don't have the required input to run Fisher's exact test, the p-value will be set to `NA`. + +2. Results not interpretable: we don't want to flag in certain situations when results are not interpretable due to small sample sizes. The default threshold for minimum denominator requirement is 3 patients at the site level. + +3. An observed zero cell is not an issue when using Fisher's exact test, however, when the expected cell is zero, it means either the marginal is zero (meaningless) or there are structural zeros (need to consider zero-inflated issue: West, L. and Hankin, R. (2008), "Exact Tests for Two-Way Contingency Tables with Structural Zeros," Journal of Statistical Software, 28(11), 1--19). + +### Constraints + +For small samples, Fisher's exact test is highly discrete. Fisher's exact test is often considered to be more conservative. This may be due to the use of a discrete statistic with fixed significance levels ([FET Controversies Wiki](https://en.wikipedia.org/wiki/Fisher%27s_exact_test#Controversies)). + +Although in practice, Fisher's exact test is usually used when sample sizes are small (e.g., n\<5), it is valid for all sample sizes. However, when sample sizes are large, the computation of the exact test evaluating the hypergeometric probability function given the marginal can take a very long time. + +### Recommendation + +Fisher's exact test can be used in all scenarios with binary KRIs. + + +## 4.
The Poisson Regression Method + +### Introduction + +For the rate outcome KRIs, an optional method in `{gsm.core}` is implemented with Poisson regression. + +The Poisson distribution is often used to model count data. If $Y$ is the number of counts following a Poisson distribution, the probability mass function is given by $$ +f(y)=\frac{\mu^ye^{-\mu}}{y!} +$$ where $\mu$ is the average number of counts and $E(Y)=Var(Y)=\mu$. + +### Methods + +This method fits a Poisson model to site-level data and then calculates deviance residuals for each site. The Poisson model is run using standard methods in the `stats` package by fitting a `glm` model with family set to `poisson` using a "log" link. Site-level deviance residuals are calculated using `resid` from `stats::predict.glm` via `broom::augment`. + +Let $Y_1, \dots, Y_N$ be independent random variables with $Y_i \sim Poisson(\mu_i)$ denoting the number of events observed from $n_i$ for the $i$ th observation following a Poisson distribution. Then $E(Y_i)=\mu_i=n_ie^{x_i\beta}$. Thus, the log-linear generalized linear model (Poisson regression) is +$$ +\log{\mu_i}=\log{n_i}+x_i\beta \quad Y_i \sim Poisson(\mu_i) +$$ + +where $\log{n_i}$ is an offset term. + +### Estimate and Score + +The function `Analyze_Poisson()` in `{gsm.core}` utilizes `stats::glm()` to generate an estimate of fitted values as well as deviance residuals with site-level count data. The deviance residuals are then used as a scoring metric in `{gsm.core}` to flag possible outliers using the thresholds discussed below. + +### Threshold + +By default, sites with deviance residuals exceeding $\pm 5$ or $\pm 7$ from the Poisson analysis are flagged as amber or red, respectively. The thresholds are set based on an empirical approach, where we use the distribution of the deviance residuals to find the best separation of the data to identify sites at risk. The default thresholds are set at empirical values based on pilot studies' data.
However, they are fully configurable in the package and can be customized and specified in the `{gsm.core}` functions. + + +### Special Situations + +1. Results are not interpretable or it is not appropriate to apply the Poisson method: We don't want to flag in certain situations when results not interpretable or when it is not appropriate to apply the Poisson method due to the small sample sizes. The default threshold for minimum denominator requirement is 30 days exposure at the site level. + + + +### Poisson regression assumptions + +1. **Independence** The responses $y_i$ are independent of each other. + +2. **Count data** The responses $y_i$ are non-negative integer (counts). + +3. **Poisson response** Each $Y_i$ follows the Poisson distribution as noted above with mean and variance equal to $\mu_i$. + +4. **Linearity** $\log{\mu_i}=\log{n_i}+x_i\beta$ where $x_i$ are independent predictors. + +### Assumption checks, constraints and model diagnosis + +1. The assumptions on independence and counted data can be assessed by the knowledge of data collected. + +2. The assumptions on Poisson response can be checked by plotting histogram of the data and comparing empirical mean and variance stratified by the explanatory variable(s). If there is evidence that the assumption of mean=variance is violated, oftentimes we observe variance\>mean. This is called overdispersion. In this case, negative binomial distribution provides an alternative where $Var(Y_i)=\phi E(Y_i)$. + +3. Diagnosis: Goodness of fit test (chi-squared) and deviance residuals. Residuals vs fitted plot. Q-Q plot. + +4. Other considerations: Structural zeros may happen in contrast to random zeros due to sampling from poisson distribution. In this case, a mixture model (zero-inflated Poisson model) may be required. + +### Recommendation + +Use this method when Poisson assumptions hold. 
diff --git a/vignettes/gsmExtensions.Rmd b/vignettes/gsmExtensions.Rmd index 8d02fa5..eaa38b1 100644 --- a/vignettes/gsmExtensions.Rmd +++ b/vignettes/gsmExtensions.Rmd @@ -1,5 +1,6 @@ --- title: "gsm Extensions" +description: "This vignette describes how to extend {gsm.core} by creating new 'modules', including metrics, reports and shiny apps that can be run using the standard gsm pipeline." output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{gsm Extensions} @@ -16,9 +17,9 @@ knitr::opts_chunk$set( # Overview -This vignette describes how to extend `gsm` by creating new "modules", including metrics, reports and shiny apps that can be run using the standard `gsm` pipeline described in these vignettes (`vignette("DataAnalysis")`, `vignette("DataReporting")`). As shown in the `vignette("DataAnalysis")`, the `gsm` data pipeline can be used to capture a monitoring 'snapshot' for a study that includes a variety of "modules" including [metrics](https://github.com/Gilead-BioStats/gsm/tree/dev/inst/workflow/metrics/kri0001.yaml) and [reports](https://github.com/Gilead-BioStats/gsm/blob/dev/inst/workflow/reports/report_kri_site.yaml). Some core modules are included in the `gsm` package, while others can be added as extensions. +This vignette describes how to extend `{gsm.core}` by creating new "modules", including metrics, reports and shiny apps that can be run using the standard `gsm` pipeline described in these vignettes (`vignette("DataAnalysis")` and [DataReporting](https://gilead-biostats.github.io/gsm.reporting/articles/DataReporting.html)). As shown in the `vignette("DataAnalysis")`, the existing `gsm` data pipeline can be used to capture a monitoring 'snapshot' for a study that includes a variety of "modules" including [metrics](https://github.com/Gilead-BioStats/gsm.kri/tree/dev/inst/workflow/2_metrics/kri0001.yaml) and [reports](https://github.com/Gilead-BioStats/gsm.kri/blob/dev/inst/workflow/4_modules/report_kri_site.yaml).
-This vignette provide detailed specifications for creating new modules, a description of the directory structure for the yaml workflows that comprise a module pipeline, and links to resources that can be used to configure study-level gsm pipelines that utilize these `gsm` extensions. +This vignette provide detailed specifications for creating new modules, a description of the directory structure for the yaml workflows that comprise a module pipeline, and links to resources that can be used to configure study-level gsm pipelines that utilize these extensions. # Module Configuration @@ -34,8 +35,8 @@ Detailed specifications for each of these sections are provided below. Here are links to several sample module configuration files: -- [12 Standard gsm KRIs](https://github.com/Gilead-BioStats/gsm/tree/dev/inst/workflow/metrics) (e.g. [Adverse Event KRI Metric](https://github.com/Gilead-BioStats/gsm/tree/dev/inst/workflow/metrics/kri0001.yaml)) -- [Site-](https://github.com/Gilead-BioStats/gsm/blob/dev/inst/workflow/reports/report_kri_site.yaml) and [Country-level KRI Report](https://github.com/Gilead-BioStats/gsm/blob/dev/inst/workflow/reports/report_kri_country.yaml) +- [12 Standard gsm KRIs](https://github.com/Gilead-BioStats/gsm.kri/tree/dev/inst/workflow/2_metrics) (e.g. [Adverse Event KRI Metric](https://github.com/Gilead-BioStats/gsm.kri/tree/dev/inst/workflow/2_metrics/kri0001.yaml)) +- [Site-](https://github.com/Gilead-BioStats/gsm.kri/blob/dev/inst/workflow/4_modules/report_kri_site.yaml) and [Country-level KRI Report](https://github.com/Gilead-BioStats/gsm.kri/blob/dev/inst/workflow/4_modules/report_kri_country.yaml) ## `meta` Specification @@ -49,7 +50,7 @@ The `meta` section of a workflow YAML provides key metadata describing the modul - `Details`: *optional* A more detailed description of the module specified in the workflow. - `Repo`: Package repo and version. Should be compatible with the `repo` parameter in `remotes::install_github()`. 
- `Status`: The validation status of the reporting output. Valid values: - - `Qualified`: Output has been qualified via our qualification process specified [here](https://gilead-biostats.github.io/gsm/articles/QualificationWorkflow.html). + - `Qualified`: Output has been qualified via our qualification process specified [here](https://gilead-biostats.github.io/gsm.qc/articles/QualificationWorkflow.html). - `Pilot`: Output is being used by pilot studies and is maintained in a package repository. - `Prototype`: Output is created using custom scripts on an ad-hoc basis. @@ -59,7 +60,7 @@ Additional `meta` header required fields for **Modules**: - `Output`: The output of the workflow, including format. Each workflow should only produce a single reporting output. - `ExampleURL`: Location of a sample report. For html reports, this is typically a page on the pkgdown site (ending with "/{ModuleID}.html"), or a sample app deployed on [shinyapps.io](https://shinyapps.io). -Additional `meta` header required fields for `{gsm}` **metrics**: +Additional `meta` header required fields for `{gsm.kri}` **metrics**: - `GroupLevel`: The level at which the metric is calculated. Common values: `Site`, `Country`. - `Abbreviation`: Abbreviation of the metric. @@ -101,7 +102,7 @@ The `spec` section of the workflow YAML is formatted as a list of data tables, w ### `spec` example: Metric Module -Metric `spec`s are typically pulled from the `mapped` data layer. For example, the `spec` section for the [AE KRI metric](https://github.com/Gilead-BioStats/gsm/tree/dev/inst/workflow/metrics/kri0001.yaml) is: +Metric `spec`s are typically pulled from the `mapped` data layer. 
For example, the `spec` section for the [AE KRI metric](https://github.com/Gilead-BioStats/gsm.kri/tree/dev/inst/workflow/2_metrics/kri0001.yaml) is: ``` spec: @@ -121,7 +122,7 @@ So, in summary, the AE KRI metric requires two data tables, `Mapped_AE` and `Map ### `spec` examples: Report Module -Report modules most often pull data from the `Reporting` data layer. For example, the [Site-level KRI report](https://github.com/Gilead-BioStats/gsm/blob/dev/inst/workflow/reports/report_kri_site.yaml) has the following `spec`: +Report modules most often pull data from the `Reporting` data layer. For example, the [Site-level KRI report](https://github.com/Gilead-BioStats/gsm.kri/blob/dev/inst/workflow/4_modules/report_kri_site.yaml) has the following `spec`: ``` spec: @@ -132,7 +133,7 @@ spec: Note that the `_all` key word is used to specify that all standard columns from the `Reporting_Results` data table are expected and that the table required - without it, the report can't run. The other `Reporting` tables are used to enhance the report, but are not required, and thus not included in the spec. -The `Mapped` data layer is also available for use in reports and apps. Most typically, mapped data is used to drill down from high-level metric findings (e.g. "Site 5 has an elevated AE rate relative to other studies") to site- or participant- level details (e.g. "Participant 00016 from Site 5 had 5 AEs and 3 SAEs reported in the last 3 months."). For example, the [Deep Dive app]() includes both Reporting and Mapped data in its `spec`. Here is a representative excerpt from the `spec`: +The `Mapped` data layer is also available for use in reports and apps. Most typically, mapped data is used to drill down from high-level metric findings (e.g. "Site 5 has an elevated AE rate relative to other studies") to site- or participant- level details (e.g. "Participant 00016 from Site 5 had 5 AEs and 3 SAEs reported in the last 3 months."). 
For example, the [Deep Dive app](https://openrbqm.shinyapps.io/gsm-app/) includes both Reporting and Mapped data in its `spec`. Here is a representative excerpt from the `spec`: ``` spec: @@ -153,32 +154,32 @@ spec: Finally, each module yaml configuration file should have a `steps` property that describes in detail how the module is run. The `steps` section is a list of functions that are run in sequence to produce the final output. Each item in `steps` has the following properties: -- `name`: The name of the function to be run. This must be a function that is available in `{gsm}` package or in a package that is listed in the `repo` section of the `meta` header. +- `name`: The name of the function to be run. This is typically a function that is available in one of the `{gsm}` packages or in a package that is listed in the `repo` section of the `meta` header. - `output`: The name of the output of the function. This is the name of the data table that is created by the function. - `params`: A list of parameters that are passed to the function. The parameters are specific to the function that is being run. See below for more details on how to specify parameters for each function. -**Note**: It is important to note that the default behavior of the `RunWorkflow()` and `RunWorkflows()` functions is to return the *last* output in the steps section of the workflow. therefore, each yaml file- regardless of which directory it is in- should only produce one output, whether that be a data table, list, html output, deployed shiny app, or any other object needed to produce the module output. +**Note**: It is important to note that the default behavior of the `gsm.core::RunWorkflow()` and `gsm.core::RunWorkflows()` functions is to return the *last* output in the steps section of the workflow. 
Therefore, each yaml file- regardless of which directory it is in- should only produce one output, whether that be a data table, list, html output, deployed shiny app, or any other object needed to produce the module output. -The `steps` is the most complex part of the module configuration and will vary greatly depending on the module type and the specific requirements of the module. `gsm` provides several functions that allow for module yaml files to be run in a standard way. See `?gsm::RunWorkflow()` for more details. +The `steps` section is the most complex part of the module configuration and will vary greatly depending on the module type and the specific requirements of the module. `gsm.core` provides several functions that allow for module yaml files to be run in a standard way. See `?gsm.core::RunWorkflow()` for more details. ### `steps[]$params` Specification -After processing the YAML `meta` and `spec` sections, `gsm::RunWorkflow()` calls `gsm::RunStep()` for each step in the `steps` section of the YAML. The `params` section of each step is passed to `RunStep()` as a list of parameters along with a copy of the metadata header (`lMeta`) and any data (`lData`). `RunStep()` then parses the list of `params` by passing data from `lMeta` and `lData` when appropriate - see `?RunStep` for a detailed of how parameter values are populated. Finally, the parsed parameters are passed to the function specified in the `name` field of the step. +After processing the YAML `meta` and `spec` sections, `gsm.core::RunWorkflow()` calls `gsm.core::RunStep()` for each step in the `steps` section of the YAML. The `params` section of each step is passed to `gsm.core::RunStep()` as a list of parameters along with a copy of the metadata header (`lMeta`) and any data (`lData`). `gsm.core::RunStep()` then parses the list of `params` by passing data from `lMeta` and `lData` when appropriate - see `?gsm.core::RunStep` for a detailed description of how parameter values are populated. 
Finally, the parsed parameters are passed to the function specified in the `name` field of the step. ### `steps` examples #### `Metric` steps example -In the example below, the steps to produce the AE analysis output is specified. Here, `Threshold`, `GroupLevel`, `Type` and `nMinDenominator` are specified in the `meta` section of the workflow, and would be access via the `paramVal` process discussed above. As a default, the output of these steps as run with `RunWorkflows()` would be a list of data tables, as specified in the final `list` step of the workflow. +In the example below, the steps to produce the AE analysis output are specified. Here, `Threshold`, `GroupLevel`, `Type` and `nMinDenominator` are specified in the `meta` section of the workflow, and would be accessed via the `paramVal` process discussed above. As a default, the output of these steps as run with `gsm.core::RunWorkflows()` would be a list of data tables, as specified in the final `list` step of the workflow. ``` steps: - - name: ParseThreshold + - name: gsm.core::ParseThreshold output: vThreshold params: strThreshold: Threshold - - name: Input_Rate + - name: gsm.core::Input_Rate output: Analysis_Input params: dfSubjects: Mapped_SUBJ @@ -190,21 +191,21 @@ steps: strNumeratorMethod: Count strDenominatorMethod: Sum strDenominatorCol: timeonstudy - - name: Transform_Rate + - name: gsm.core::Transform_Rate output: Analysis_Transformed params: dfInput: Analysis_Input - - name: Analyze_NormalApprox + - name: gsm.core::Analyze_NormalApprox output: Analysis_Analyzed params: dfTransformed: Analysis_Transformed strType: AnalysisType - - name: Flag_NormalApprox + - name: gsm.core::Flag output: Analysis_Flagged params: dfAnalyzed: Analysis_Analyzed vThreshold: vThreshold - - name: Summarize + - name: gsm.core::Summarize output: Analysis_Summary params: dfFlagged: Analysis_Flagged @@ -222,28 +223,28 @@ steps: #### `Report` steps example -In this example, the steps to produce a site-level KRI report is 
displayed. Here, the only inputs are the `Reporting_*` data, which goes through a simple filtering process via `RunQuery` before the Charts and Report are created in the following two functions +In this example, the steps to produce a site-level KRI report are displayed. Here, the only inputs are the `Reporting_*` data, which goes through a simple filtering process via `gsm.core::RunQuery` before the Charts and Report are created in the following two functions: ``` steps: - - name: RunQuery + - name: gsm.core::RunQuery output: Reporting_Results_Site params: df: Reporting_Results strQuery: "SELECT * FROM df WHERE GroupLevel == 'Site'" - - name: RunQuery + - name: gsm.core::RunQuery output: Reporting_Metrics_Site params: df: Reporting_Metrics strQuery: "SELECT * FROM df WHERE GroupLevel == 'Site'" - - name: MakeCharts + - name: gsm.kri::MakeCharts output: lCharts_Site params: dfResults: Reporting_Results_Site dfGroups: Reporting_Groups dfBounds: Reporting_Bounds dfMetrics: Reporting_Metrics_Site - - name: Report_KRI + - name: gsm.kri::Report_KRI output: lReport params: lCharts: lCharts_Site @@ -259,7 +260,7 @@ Each extension that produces report(s) will have a `workflow` directory in the ` ### `/1_mappings` -The mappings folder contains all of the mappings from `Raw_*` data to `Mapped_*` data. Each file within this directory is to be named for the data table it is creating, minus the `Mapped_` suffix. The yamls will contain the three required sections, which are discussed in detail in the `Module Configuration` section above. The yamls in this folder will be combined via `CombineSpecs()` to create a master spec that defines all necessary tables and columns for the module(s) in this package. +The mappings folder in the `{gsm.mapping}` package contains all of the mappings from `Raw_*` data to `Mapped_*` data. Each file within this directory is to be named for the data table it is creating, minus the `Mapped_` prefix. 
The yamls will contain the three required sections, which are discussed in detail in the `Module Configuration` section above. The yamls in this folder will be combined via `gsm.mapping::CombineSpecs()` to create a master spec that defines all necessary tables and columns for the module(s) in this package. Below are two examples of these mapping yaml files- the first which requires no transformations, and is very simple, and the second which requires multiple steps to produce the desired mapped data. @@ -306,13 +307,13 @@ spec: steps: # Merge [ subjid ] onto EDC domains. - output: Temp_SubjectLookup - name: select + name: dplyr::select params: .data: Mapped_SUBJ subjid: subjid subject_nsv: subject_nsv - output: Mapped_DATACHG - name: left_join + name: dplyr::left_join params: x: Raw_DATACHG "y": Temp_SubjectLookup @@ -321,7 +322,7 @@ steps: ### `/2_metrics` -The metrics directory contains all of the workflows that perform analysis steps, converting mapped data into metrics that are displayed in a report. In the case of `{gsm}`, these metrics are the 12 Key Risk Indicators, calculated at both the site- and country-level, that are discussed in the Data Analysis Step-by-Step Vignette. Each yaml in this file produces a list of analysis data tables that capture the formatted input table, the transformed table, the flagged table, and the summary table. In general, these yamls should at least provide a summary table that contains statistics about the metric at the specified level of aggregation. +The metrics directory contains all of the workflows that perform analysis steps, converting mapped data into metrics that are displayed in a report. In the case of `{gsm.kri}`, these metrics are the 12 Key Risk Indicators, calculated at both the site- and country-level, that are discussed in the Data Analysis Step-by-Step Vignette. 
Each yaml in this directory produces a list of analysis data tables that capture the formatted input table, the transformed table, the flagged table, and the summary table. In general, these yamls should at least provide a summary table that contains statistics about the metric at the specified level of aggregation. Examples of these yamls can be found above in the `Module Configuration` section, as well as in the (`vignette("DataAnalysis")`) vignette. @@ -333,9 +334,9 @@ Examples of these yamls can be found above in the `Module Configuration` section ### `/4_modules` -The modules directory contains the final workflow(s) of the reporting pipeline. These workflows each produce a single output based on the data tables that have been produced in the previous directories. These module workflows will contain all of the necessary meta information, as detailed in the `Module Configuration` section above, along with the data tables required, and steps to produce it, so that `gsm::RunWorkflow()` can take this workflow and produce the module output. +The modules directory contains the final workflow(s) of the reporting pipeline. These workflows each produce a single output based on the data tables that have been produced in the previous directories. These module workflows will contain all of the necessary meta information, as detailed in the `Module Configuration` section above, along with the data tables required, and steps to produce it, so that `gsm.core::RunWorkflow()` can take this workflow and produce the module output. 
-Below is an example of the module yaml workflow for the KRI Site Report in `{gsm}` +Below is an example of the module yaml workflow for the KRI Site Report in `{gsm.kri}` ``` meta: @@ -344,11 +345,11 @@ meta: Output: html Name: Site-Level Key Risk Indicator Report Description: A report summarizing key risk indicators at the site level - Repo: gsm v2.1.0 + Repo: gsm.kri v1.0.0 Status: Qualified Permission: Users Outputs: An html report - ExampleURL: https://gilead-biostats.github.io/gsm/report_kri_site.html + ExampleURL: https://gilead-biostats.github.io/gsm.kri/report_kri_site.html spec: Reporting_Results: _all: @@ -363,24 +364,24 @@ spec: _all: required: true steps: - - name: RunQuery + - name: gsm.core::RunQuery output: Reporting_Results_Site params: df: Reporting_Results strQuery: "SELECT * FROM df WHERE GroupLevel == 'Site'" - - name: RunQuery + - name: gsm.core::RunQuery output: Reporting_Metrics_Site params: df: Reporting_Metrics strQuery: "SELECT * FROM df WHERE GroupLevel == 'Site'" - - name: MakeCharts + - name: gsm.kri::MakeCharts output: lCharts_Site params: dfResults: Reporting_Results_Site dfGroups: Reporting_Groups dfBounds: Reporting_Bounds dfMetrics: Reporting_Metrics_Site - - name: Report_KRI + - name: gsm.kri::Report_KRI output: lReport params: lCharts: lCharts_Site From 5b26075e2ca2a71fc1e8a8aac00d8bc1e70e81d4 Mon Sep 17 00:00:00 2001 From: Laura Maxwell Date: Thu, 20 Mar 2025 11:16:42 -0400 Subject: [PATCH 02/22] example updates --- inst/examples/1_AdverseEventKRI.R | 10 +++-- inst/examples/2_AdverseEventWorkflow.R | 11 ++++-- inst/examples/3_ReportingWorkflow.R | 53 +++++++++++--------------- inst/examples/4_WorkflowIO.R | 2 +- vignettes/Cookbook.Rmd | 8 ++-- 5 files changed, 40 insertions(+), 44 deletions(-) diff --git a/inst/examples/1_AdverseEventKRI.R b/inst/examples/1_AdverseEventKRI.R index 774f17b..edd89ad 100644 --- a/inst/examples/1_AdverseEventKRI.R +++ b/inst/examples/1_AdverseEventKRI.R @@ -38,16 +38,18 @@ SAE_KRI <- Input_Rate( 
table(SAE_KRI$Flag) ### Example 1.3 - Visualize Metric distribution using Bar Charts using provided htmlwidgets +library(gsm.kri) + labels <- list( Metric= "Serious Adverse Event Rate", Numerator= "Serious Adverse Events", Denominator= "Days on Study" ) -Widget_BarChart(dfResults = SAE_KRI, lMetric=labels, strOutcome="Metric") -Widget_BarChart(dfResults = SAE_KRI, lMetric=labels, strOutcome="Score") -Widget_BarChart(dfResults = SAE_KRI, lMetric=labels, strOutcome="Numerator") +gsm.kri::Widget_BarChart(dfResults = SAE_KRI, lMetric=labels, strOutcome="Metric") +gsm.kri::Widget_BarChart(dfResults = SAE_KRI, lMetric=labels, strOutcome="Score") +gsm.kri::Widget_BarChart(dfResults = SAE_KRI, lMetric=labels, strOutcome="Numerator") ### Example 1.4 - Create Scatter plot with confidence bounds dfBounds <- Analyze_NormalApprox_PredictBounds(SAE_KRI, vThreshold = c(-3,-2,2,3)) -Widget_ScatterPlot(SAE_KRI, lMetric = labels, dfBounds = dfBounds) +gsm.kri::Widget_ScatterPlot(SAE_KRI, lMetric = labels, dfBounds = dfBounds) diff --git a/inst/examples/2_AdverseEventWorkflow.R b/inst/examples/2_AdverseEventWorkflow.R index 040b500..8ba5bc9 100644 --- a/inst/examples/2_AdverseEventWorkflow.R +++ b/inst/examples/2_AdverseEventWorkflow.R @@ -1,3 +1,6 @@ +library(gsm.mapping) +library(gsm.kri) + #### Example 2.1 - Configurable Adverse Event Workflow # Define YAML workflow @@ -79,8 +82,8 @@ lMappingWorkflows <- MakeWorkflowList( strPath = here::here("tests/testthat/testdata/mappings"), bExact = TRUE ) -mappings_spec <- CombineSpecs(lMappingWorkflows) -lRawData <- Ingest(gsm.core::lSource, mappings_spec) +mappings_spec <- gsm.mapping::CombineSpecs(lMappingWorkflows) +lRawData <- gsm.mapping::Ingest(gsm.core::lSource, mappings_spec) AE_data <-list( Mapped_SUBJ= lRawData$Raw_SUBJ, Mapped_AE= lRawData$Raw_AE @@ -96,7 +99,7 @@ AE_country_workflow$meta$GroupLevel <- "Country" AE_country_workflow$steps[[2]]$params$strGroupCol <- "country" AE_country_KRI <- RunWorkflow(lWorkflow = 
AE_country_workflow, lData = AE_data) -Widget_BarChart(dfResults = AE_country_KRI$Analysis_Summary, lMetric = AE_country_workflow$meta) +gsm.kri::Widget_BarChart(dfResults = AE_country_KRI$Analysis_Summary, lMetric = AE_country_workflow$meta) #### Example 2.3 - Create SAE workflow @@ -119,7 +122,7 @@ SAE_workflow$steps <- SAE_workflow$steps %>% append(filterStep, after=0) # Run the updated workflow SAE_KRI <- RunWorkflow(lWorkflow = SAE_workflow, lData = AE_data ) -Widget_BarChart(dfResults = SAE_KRI$Analysis_Summary, lMetric = SAE_workflow$meta) +gsm.kri::Widget_BarChart(dfResults = SAE_KRI$Analysis_Summary, lMetric = SAE_workflow$meta) diff --git a/inst/examples/3_ReportingWorkflow.R b/inst/examples/3_ReportingWorkflow.R index 5d679f7..71a70d6 100644 --- a/inst/examples/3_ReportingWorkflow.R +++ b/inst/examples/3_ReportingWorkflow.R @@ -2,6 +2,7 @@ library(gsm.core) library(gsm.mapping) library(gsm.kri) library(gsm.reporting) +library(dplyr) #### 3.1 - Create a KRI Report using 12 standard metrics in a step-by-step workflow @@ -40,67 +41,57 @@ lRaw <- list( ) # Step 1 - Create Mapped Data Layer - filter, aggregate and join raw data to create mapped data layer -mappings_wf <- MakeWorkflowList(strNames = core_mappings, strPath = "workflow/1_mappings", strPackage = "gsm.mapping") -mapped <- RunWorkflows(mappings_wf, lRaw) +mappings_wf <- gsm.core::MakeWorkflowList(strNames = core_mappings, strPath = "workflow/1_mappings", strPackage = "gsm.mapping") +mapped <- gsm.core::RunWorkflows(mappings_wf, lRaw) # Step 2 - Create Metrics - calculate metrics using mapped data -metrics_wf <- MakeWorkflowList(strPath = "workflow/2_metrics", strPackage = "gsm.kri") -analyzed <- RunWorkflows(metrics_wf, mapped) +metrics_wf <- gsm.core::MakeWorkflowList(strPath = "workflow/2_metrics", strPackage = "gsm.kri") +analyzed <- gsm.core::RunWorkflows(metrics_wf, mapped) # Step 3 - Create Reporting Layer - create reports using metrics data -reporting_wf <- MakeWorkflowList(strPath = 
"workflow/3_reporting", strPackage = "gsm.reporting") -reporting <- RunWorkflows(reporting_wf, c(mapped, list(lAnalyzed = analyzed, +reporting_wf <- gsm.core::MakeWorkflowList(strPath = "workflow/3_reporting", strPackage = "gsm.reporting") +reporting <- gsm.core::RunWorkflows(reporting_wf, c(mapped, list(lAnalyzed = analyzed, lWorkflows = metrics_wf))) # Step 4 - Create KRI Reports - create KRI report using reporting data -module_wf <- MakeWorkflowList(strPath = "workflow/4_modules", strPackage = "gsm.kri") -lReports <- RunWorkflows(module_wf, reporting) +module_wf <- gsm.core::MakeWorkflowList(strPath = "workflow/4_modules", strPackage = "gsm.kri") +lReports <- gsm.core::RunWorkflows(module_wf, reporting) #### 3.2 - Automate data ingestion using Ingest() and CombineSpecs() # Step 0 - Data Ingestion - standardize tables/columns names -mappings_wf <- MakeWorkflowList(strNames = core_mappings, strPath = "workflow/1_mappings", strPackage = "gsm.mapping") -mappings_spec <- CombineSpecs(mappings_wf) -lRaw <- Ingest(gsm.core::lSource, mappings_spec) +mappings_wf <- gsm.core::MakeWorkflowList(strNames = core_mappings, strPath = "workflow/1_mappings", strPackage = "gsm.mapping") +mappings_spec <- gsm.mapping::CombineSpecs(mappings_wf) +lRaw <- gsm.mapping::Ingest(gsm.core::lSource, mappings_spec) # Step 1 - Create Mapped Data Layer - filter, aggregate and join raw data to create mapped data layer -mapped <- RunWorkflows(mappings_wf, lRaw) +mapped <- gsm.core::RunWorkflows(mappings_wf, lRaw) # Step 2 - Create Metrics - calculate metrics using mapped data -metrics_wf <- MakeWorkflowList(strPath = "workflow/2_metrics", strPackage = "gsm.kri") -analyzed <- RunWorkflows(metrics_wf, mapped) +metrics_wf <- gsm.core::MakeWorkflowList(strPath = "workflow/2_metrics", strPackage = "gsm.kri") +analyzed <- gsm.core::RunWorkflows(metrics_wf, mapped) # Step 3 - Create Reporting Layer - create reports using metrics data -reporting_wf <- MakeWorkflowList(strPath = "workflow/3_reporting", 
strPackage = "gsm.reporting") -reporting <- RunWorkflows(reporting_wf, c(mapped, list(lAnalyzed = analyzed, +reporting_wf <- gsm.core::MakeWorkflowList(strPath = "workflow/3_reporting", strPackage = "gsm.reporting") +reporting <- gsm.core::RunWorkflows(reporting_wf, c(mapped, list(lAnalyzed = analyzed, lWorkflows = metrics_wf))) # Step 4 - Create KRI Report - create KRI report using reporting data -module_wf <- MakeWorkflowList(strPath = "workflow/4_modules", strPackage = "gsm.kri") -lReports <- RunWorkflows(module_wf, reporting) +module_wf <- gsm.core::MakeWorkflowList(strPath = "workflow/4_modules", strPackage = "gsm.kri") +lReports <- gsm.core::RunWorkflows(module_wf, reporting) -#### 3.4 - Combine steps in to a single workflow -#ss_wf <- MakeWorkflowList(strNames = "Snapshot") -#lReports <- RunWorkflows(ss_wf, lSource) - -#### 3.4 - Use Study configuration to specify data sources -# StudyConfig <- Read_yaml("inst/workflow/config.yaml") -# mapped <- RunWorkflows(mappings_wf, lConfig=StudyConfig) -# analyzed <- RunWorkflows(metrics_wf, lConfig=StudyConfig) -# reporting <- RunWorkflows(reporting_wf, lConfig=StudyConfig) -# lReports <- RunWorkflows(module_wf, lConfig=StudyConfig) #### 3.3 Site-Level KRI Report with multiple SnapshotDate # Below relies on the clindata stuff, do we need to rerun/rewrite reporting datasets? 
-lCharts <- MakeCharts( +lCharts <- gsm.kri::MakeCharts( dfResults = gsm.core::reportingResults, dfGroups = gsm.core::reportingGroups, dfMetrics = gsm.core::reportingMetrics, dfBounds = gsm.core::reportingBounds ) -kri_report_path <- Report_KRI( +kri_report_path <- gsm.kri::Report_KRI( lCharts = lCharts, - dfResults = FilterByLatestSnapshotDate(reportingResults), + dfResults = gsm.kri::FilterByLatestSnapshotDate(reportingResults), dfGroups = gsm.core::reportingGroups, dfMetrics = gsm.core::reportingMetrics ) diff --git a/inst/examples/4_WorkflowIO.R b/inst/examples/4_WorkflowIO.R index 9024811..18d347d 100644 --- a/inst/examples/4_WorkflowIO.R +++ b/inst/examples/4_WorkflowIO.R @@ -1,4 +1,4 @@ -load_all() +devtools::load_all() LoadData <- function(lWorkflow, lConfig, lData = NULL) { lData <- lData diff --git a/vignettes/Cookbook.Rmd b/vignettes/Cookbook.Rmd index ec20f17..64b3ae5 100644 --- a/vignettes/Cookbook.Rmd +++ b/vignettes/Cookbook.Rmd @@ -51,8 +51,8 @@ This example uses the standard {gsm} analysis workflows to creates site-level Ad - **Example 1.1** calculates the Site-level AE rates. - **Example 1.2** adds a filter to include only Serious Adverse Events (SAEs) and implements pipes to run through the workflow. -- **Example 1.3** generates bar charts showing SAE rates and z-scores by study. -- **Example 1.4** generates a scatter plot with confidence bound for SAE rates. +- **Example 1.3** generates bar charts showing SAE rates and z-scores by study using `{gsm.kri}`. +- **Example 1.4** generates a scatter plot with confidence bound for SAE rates using `{gsm.kri}`. ```{r file = system.file("examples", "1_AdverseEventKRI.R", package = "gsm.core"), eval = FALSE, include = TRUE} @@ -75,8 +75,8 @@ This examples introduces YAML workflows to re-generate the same results as in ** This example extends the previous examples to generate charts and reports for multiple KRIs. 
See the [Data Reporting Vignette](https://gilead-biostats.github.io/gsm.reporting/articles/DataReporting.html) for more detail. - **Example 3.1** steps through several workflows to generate a report for all 12 standard site-level KRIs. -- **Example 3.2** automates data ingestion using `Ingest()` and `CombineSpecs()`. -- **Example 3.3** generates a report incorporating multiple timepoints using the sample `reporting` data saved as part of {gsm}. +- **Example 3.2** automates data ingestion using `gsm.mapping::Ingest()` and `gsm.mapping::CombineSpecs()`. +- **Example 3.3** generates a report using `{gsm.kri}` incorporating multiple timepoints using the sample `reporting` data saved as part of `{gsm.core}`. ```{r file = system.file("examples", "3_ReportingWorkflow.R", package = "gsm.core"), eval = FALSE, include = TRUE} From ec3015573fbaad62099ddea57a84ba2e0aebbd96 Mon Sep 17 00:00:00 2001 From: Laura Maxwell Date: Thu, 20 Mar 2025 11:19:02 -0400 Subject: [PATCH 03/22] add reports to pkgdown --- .github/workflows/pkgdown.yaml | 74 +++++++++++++++++++++++----------- _pkgdown.yml | 9 ++++- 2 files changed, 59 insertions(+), 24 deletions(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index bfc9f4d..dc424a9 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -1,29 +1,22 @@ -# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples -# Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [main, master] - pull_request: - release: - types: [published] + branches: [main, dev] workflow_dispatch: -name: pkgdown.yaml - -permissions: read-all +name: pkgdown-main jobs: - pkgdown: + pkgdown-main: runs-on: ubuntu-latest - # Only restrict concurrency for non-PR jobs - concurrency: - group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} permissions: contents: write + steps: - uses: actions/checkout@v4 + with: + ref: main - uses: r-lib/actions/setup-pandoc@v2 @@ -33,17 +26,52 @@ jobs: - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::pkgdown, local::. + extra-packages: any::pkgdown needs: website - - name: Build site - run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) + - name: Generate sample reports shell: Rscript {0} + run: | + devtools::load_all() + dir.create(here::here("pkgdown", "assets"), recursive = TRUE) + library(gsm.kri) - - name: Deploy to GitHub pages 🚀 - if: github.event_name != 'pull_request' - uses: JamesIves/github-pages-deploy-action@v4.5.0 - with: - clean: false - branch: gh-pages - folder: docs + lCharts <- gsm.kri::MakeCharts( + dfResults = gsm.core::reportingResults, + dfGroups = gsm.core::reportingGroups, + dfMetrics = gsm.core::reportingMetrics, + dfBounds = gsm.core::reportingBounds + ) + + gsm.kri::Report_KRI( + lCharts = lCharts, + dfResults = gsm.kri::FilterByLatestSnapshotDate(gsm.core::reportingResults), + dfGroups = gsm.core::reportingGroups, + dfMetrics = gsm.core::reportingMetrics, + strOutputDir = here::here("pkgdown", "assets"), + strOutputFile = "report_kri_site.html" + ) + + ## Country Report + lCharts_country <- gsm.kri::MakeCharts( + dfResults = gsm.core::reportingResults_country, + dfGroups = gsm.core::reportingGroups_country, + dfMetrics = gsm.core::reportingMetrics_country, + dfBounds = gsm.core::reportingBounds_country + 
) + + gsm.kri::Report_KRI( + lCharts = lCharts_country, + dfResults = gsm.kri::FilterByLatestSnapshotDate(gsm.core::reportingResults_country), + dfGroups = gsm.core::reportingGroups_country, + dfMetrics = gsm.core::reportingMetrics_country, + strOutputDir = here::here("pkgdown", "assets"), + strOutputFile = "report_kri_country.html" + ) + + + - name: Deploy pkgdown + run: | + git config --local user.email "actions@github.com" + git config --local user.name "GitHub Actions" + Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' diff --git a/_pkgdown.yml b/_pkgdown.yml index 9dfc68f..7e7c23a 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -7,4 +7,11 @@ navbar: structure: left: [articles, Reports, reference, news] right: [search, github] - + components: + Reports: + text: Sample Reports + menu: + - text: Site Report + href: report_kri_site.html + - text: Country Report + href: report_kri_country.html From 18c7b82559f6216cb4b84d226d48e6e448581101 Mon Sep 17 00:00:00 2001 From: Laura Maxwell Date: Thu, 20 Mar 2025 11:21:24 -0400 Subject: [PATCH 04/22] add data reporting vignette --- vignettes/DataReporting.Rmd | 367 ++++++++++++++++++++++++++++++++++++ 1 file changed, 367 insertions(+) create mode 100644 vignettes/DataReporting.Rmd diff --git a/vignettes/DataReporting.Rmd b/vignettes/DataReporting.Rmd new file mode 100644 index 0000000..152e3bc --- /dev/null +++ b/vignettes/DataReporting.Rmd @@ -0,0 +1,367 @@ +--- +title: "Step-by-Step Reporting Workflow" +description: "This vignette walks users through the mechanics of the functions and workflows that produce the Reporting outputs in the {gsm} pipeline." 
+output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Step-by-Step Reporting Workflow} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +library(gsm.core) +library(gsm.kri) +library(gsm.mapping) +library(gsm.reporting) +library(dplyr) +library(DT) +library(gt) +set.seed(1) + +dt <- function(data){ + data %>% + DT::datatable( + extensions = 'FixedColumns', + options = list( + scrollX = FALSE, + fixedColumns = TRUE + ), + rownames = FALSE + ) +} +``` + +# Introduction + +This vignette walks users through the mechanics of the functions and workflows that produce all of the Reporting output within the `{gsm.reporting}` package. The `{gsm}` suite of packages leverages Key Risk Indicators (KRIs) and thresholds to conduct study-level, country-level and site-level Risk Based Monitoring for clinical trials. + +These functions and workflows produce data frames, visualizations, metadata, and reports to be used in reporting and error checking at clinical sites. The image below illustrates the overarching context in which the reporting workflow runs, taking inputs from both the output of the analytics workflow, as well as raw study-, site-, and country-level data in the Raw/Raw+ format. + +![](data_model_detailed.png){width="100%"} + +All of the functions to create the data frames in the reporting data model will run automatically and sequentially when a user specifies the metadata and data needed for the report, and calls upon the `gsm.core::RunWorkflow()` function on the yaml files in the `workflow/3_reporting` directory. To create a report, the output of the reporting yamls is fed into the yamls in the `workflow/4_modules` directory to produce an html document with all charts and tables created in the reporting workflow. 
For a more detailed discussion of the yaml file and directory structure, see the [`{gsm.core}` Extensions vignette](https://gilead-biostats.github.io/gsm.core/articles/gsmExtensions.html). + +Each of the individual functions can also be run independently outside of a specified yaml workflow. + +For the purposes of this documentation, we will evaluate the input(s) and output(s) of each individual function for a specific KRI to show the stepwise progression of how a yaml workflow is set up to handle and process reporting-level data. + +------------------------------------------------------------------------ + +## Case Study - Step-by-Step Full Site-Level Report + +We will use sample clinical data simulated from the [`{gsm.datasim}`](https://github.com/Gilead-BioStats/gsm.datasim) package to run the full site-level report for all 12 KRIs included in this package. The focus of this vignette is the reporting workflow, so the output of the analytics workflow will be briefly discussed, but only in the context of *inputs* to the reporting workflow. + +Additional supporting functions are explored in [Appendix 1](#appendix-1). + +### Step 0 - Run Analysis Workflow(s) + +Prior to running the reporting model to create reporting data frames, charts and reports, the metrics we are reporting on must be properly calculated and flagged with the analysis workflow. For more information on the Analysis Workflow, see the associated `vignette("DataAnalysis")`. + +To run the analysis workflow on all 13 KRIs using `clindata` Raw+ data, use the code snippet below. From this, three pieces of output will be used in the reporting workflow: + +1. `lAnalysis` - list of data frames in the analysis data model +2. `lWorkflow` - list containing the metadata for each of the KRIs +3. 
`mapped$Mapped_SUBJ` - mapped data.frame of enrolled participants + +```{r include = TRUE, message = FALSE} +core_mappings <- c("AE", "COUNTRY", "DATACHG", "DATAENT", "ENROLL", "LB", + "PD", "QUERY", "STUDY", "STUDCOMP", "SDRGCOMP", "SITE", "SUBJ") + +lSource <- gsm.core::lSource + +# Step 0 - Data Ingestion - standardize tables/columns names +mappings_wf <- MakeWorkflowList(strPath = "workflow/1_mappings", + strNames = core_mappings, + strPackage = "gsm.mapping") +mappings_spec <- CombineSpecs(mappings_wf) +lRaw <- Ingest(lSource, mappings_spec) + +# Step 1 - Create Mapped Data Layer - filter, aggregate and join raw data to create mapped data layer +mapped <- RunWorkflows(mappings_wf, lRaw) + +# Step 2 - Create Metrics - calculate metrics using mapped data +metrics_wf <- MakeWorkflowList(strPath = "workflow/2_metrics", strNames = "kri", strPackage = "gsm.kri") +lAnalysis <- RunWorkflows(metrics_wf, mapped) +``` + + +### Step 1 - Create Reporting Model Data Frames + +With all necessary inputs to the reporting model created, we can move on to generate the reporting data model data frames. These data frames created are as follows: + +1. `dfGroups`: Group-level metadata dictionary. Created by passing CTMS site and study data to `MakeLongMeta()`. +2. `dfMetrics`: Metric-specific metadata for use in charts and reporting. Created by passing an `lWorkflow` object to `MakeMetric()`. +3. `dfResults`: A stacked summary of analysis pipeline output. Created by passing a list of + results returned by `Summarize()` to `BindResults()`. +4. `dfBounds`: Set of predicted percentages/rates and upper- and lower-bounds across the full range of sample sizes/total exposure values for reporting. Created by passing `dfResults` and `dfMetrics` to `MakeBounds()`. + +For more details on any of these tables, see `vignette("DataModel")`. + +The following sub-steps will dive into the creation and structure of each of these tables. 
Sample data for each of these tables can be found in `{gsm}` as `reportingGroups`, `reportingMetrics`, `reportingResults` and `reportingBounds`. These sample tables are used throughout the package in examples and documentation. + + +#### Step 1.1 - Transform CTMS data into `dfGroups` data frame + +The `dfGroups` data frame is critical to providing site-, study- and country-level information in the final report. This table is based on CTMS data and the mapped `dfEnrolled` data frame created in the Analysis workflow. Creating this table requires the creation of 5 smaller tables that summarize the data at each group level using `RunQuery()` and `MakeLongMeta()`. These small tables are then bound together to create `dfGroups`. + +```{r include = TRUE, message = FALSE} +#Transform CTMS Site and Study Level data +dfCTMSSite <- gsm.core::RunQuery(df = lSource$Raw_SITE, + strQuery = "SELECT pi_number as GroupID, site_status as Status, pi_first_name as InvestigatorFirstName, pi_last_name as InvestigatorLastName, city as City, state as State, country as Country, * FROM df") |> + gsm.mapping::MakeLongMeta(strGroupLevel = 'Site') + +dfCTMSStudy <- gsm.core::RunQuery(df = lSource$Raw_STUDY, + strQuery = "SELECT protocol_number as GroupID, status as Status, * FROM df") |> + gsm.mapping::MakeLongMeta(strGroupLevel = 'Study') + +# Get Participant and Site counts for Country, Site and Study +dfSiteCounts <- gsm.core::RunQuery(df = mapped$Mapped_SUBJ, + strQuery = "SELECT invid as GroupID, COUNT(DISTINCT subjid) as ParticipantCount, COUNT(DISTINCT invid) as SiteCount FROM df GROUP BY invid") |> + gsm.mapping::MakeLongMeta(strGroupLevel = "Site") + +dfStudyCounts <- gsm.core::RunQuery(df = mapped$Mapped_SUBJ, + strQuery = "SELECT studyid as GroupID, COUNT(DISTINCT subjid) as ParticipantCount, COUNT(DISTINCT invid) as SiteCount FROM df GROUP BY studyid") |> + gsm.mapping::MakeLongMeta(strGroupLevel = "Study") + +dfCountryCounts <- gsm.core::RunQuery(df = mapped$Mapped_SUBJ, + strQuery = 
"SELECT country as GroupID, COUNT(DISTINCT subjid) as ParticipantCount, COUNT(DISTINCT invid) as SiteCount FROM df GROUP BY country") |> + gsm.mapping::MakeLongMeta(strGroupLevel = "Country") + + +# Combine CTMS and Counts data as dfGroups +dfGroups <- dplyr::bind_rows(SiteCounts = dfSiteCounts, + StudyCounts = dfStudyCounts, + CountryCounts = dfCountryCounts, + Site = dfCTMSSite, + Study = dfCTMSStudy) +``` + +The resulting `dfGroups` dataframe contains the following columns: + +- `GroupID`: Group Identifier +- `GroupLevel`: Type of Group specified in `GroupID` (Country, Site, Study) +- `Param`: Parameter Name (e.g. "Status") +- `Value`: Parameter Value (e.g. "Active") + +A more detailed explanation of the `Param`s for each group level can be found in `vignette("DataModel")`. + +```{r, echo = FALSE, warning = FALSE} +datatable(dfGroups) +``` + +#### Step 1.2 - Create `dfMetrics` Metadata + +The `dfMetrics` table contains the metadata for each of the KRIs in the report. This information comes from the `meta` section of the metric workflows, `metrics_wf` defined in Step 0. Using this workflow information as the input, `MakeMetric()` is used to produce a data frame with one row per metric. + + +```{r include = TRUE} +dfMetrics <- gsm.reporting::MakeMetric(lWorkflows = metrics_wf) +``` + +The resulting `dfMetrics` dataframe contains the following columns: + + - `File`: The yaml file for workflow + - `MetricID`: ID for the Metric + - `Group`: The group type for the metric (e.g. 
"Site") + - `Abbreviation`: Abbreviation for the metric + - `Metric`: Name of the metric + - `Numerator`: Data source for the Numerator + - `Denominator`: Data source for the Denominator + - `Model`: Model used to calculate metric + - `Score`: Type of Score reported + - `Threshold`: Thresholds to be used for bounds and flags + + +```{r, echo = FALSE, warning = FALSE} +datatable(dfMetrics) +``` + +#### Step 1.3 - Stack `dfSummary` data into `dfResults` + +The reporting workflow requires that all metrics are stacked into a single data frame, `dfResults`. This stacked data frame is created by feeding the `lAnalysis` list from the analysis workflow into `BindResults()` along with the snapshot date and the study id. + +```{r include = TRUE} +dfResults <- gsm.reporting::BindResults(lAnalysis = lAnalysis, + strName = "Analysis_Summary", + dSnapshotDate = Sys.Date(), + strStudyID = "ABC-123") +``` + +The resulting `dfResults` data frame contains the following columns: + + - `GroupID`: Group Identifier + - `GroupLevel`: Type of Group specified in `GroupID` (Country, Site, Study) + - `Numerator`: The calculated numerator value + - `Denominator`: The calculated denominator value + - `Metric`: The calculated rate/metric value + - `Score`: The calculated metric score + - `Flag`: The calculated flag + - `MetricID`: The Metric ID + - `StudyID`: The Study ID + - `SnapshotDate`: The Date of the snapshot + + +```{r, echo = FALSE, warning = FALSE} +datatable(dfResults) %>% +formatRound(columns = "Metric", digits = 3) +``` + +#### Step 1.4 - Create `dfBounds` for Confidence Intervals + +Several of the charts created for the KRI reports use confidence intervals and bounds to delineate the observations based on the flag they receive (no flag, amber or red). In order to create the data frame that contains the information about these boundaries, `dfBounds`, `dfResults` and `dfMetrics` is fed into the `MakeBounds()` function. 
The `MakeBounds()` function is a wrapper around the `Analyze_*_PredictBounds()` functions that create the bounds based on the model used to estimate the metric (Normal Approximation or Poisson). + +```{r include = TRUE} +dfBounds <- gsm.reporting::MakeBounds(dfResults = dfResults, + dfMetrics = dfMetrics) +``` + +The resulting `dfBounds` data frame contains the following columns: + + - `Threshold`: The number of standard deviations that the upper and lower bounds are based on + - `Denominator`: The calculated denominator value + - `LogDenominator`: The calculated log denominator value + - `Numerator`: The calculated numerator value + - `Metric`: The calculated rate/metric value + - `MetricID`: The Metric ID + - `StudyID`: The Study ID + - `SnapshotDate`: The Date of the snapshot + + +```{r, echo = FALSE, warning = FALSE} +datatable(dfBounds) %>% +formatRound(columns = "Metric", digits = 3) +``` + +### Step 2 - Create Visualizations + +Now that all of the data frames in the reporting data model have been created, we can create the charts that display this data in a useful and easily interpreted way. All four of the data frames created in Step 1 are fed into the `MakeCharts()` function to create all relevant charts given the input data. `MakeCharts()` is a wrapper around several helper functions that generate each static visualization and JS widget individually. Appendix 1 goes into more detail about each of these individual functions. + +```{r include = TRUE, warning = FALSE, message = FALSE} +lCharts <- gsm.kri::MakeCharts(dfResults = dfResults, + dfGroups = dfGroups, + dfBounds = dfBounds, + dfMetrics = dfMetrics) +``` + +The output of `MakeCharts` is a list containing the following charts: + - `scatterJS`: A scatter plot using JavaScript. + - `scatter`: A scatter plot using ggplot2. + - `barMetricJS`: A bar chart using JavaScript with metric on the y-axis. + - `barScoreJS`: A bar chart using JavaScript with score on the y-axis. 
+ - `barMetric`: A bar chart using ggplot2 with metric on the y-axis. + - `barScore`: A bar chart using ggplot2 with score on the y-axis. + - `timeSeriesContinuousScoreJS`: A time series chart using JavaScript with score on the y-axis. + - `timeSeriesContinuousMetricJS`: A time series chart using JavaScript with metric on the y-axis. + - `timeSeriesContinuousNumeratorJS`: A time series chart using JavaScript with numerator on the y-axis. + +If the data only contains one snapshot date, then the `timeseries` charts will not be created. + +Below are the static and interactive versions of the scatter plot for the AE KRI: + +```{r fig.height=6, fig.width=8, warning=FALSE} +lCharts$Analysis_kri0001$scatter + +lCharts$Analysis_kri0001$scatterJS +``` + + +### Step 3 - Generate Report + +All of the components are created to generate the HTML report for the study we are working on. In order to generate this report and save it locally, simply feed `lCharts`, `dfResults`, `dfGroups`, `dfMetrics` and (optionally) an absolute directory path and file to which the report will be saved (`strOutputDir` and `strOutputFile`, respectively) into `Report_KRI()` and the HTML output will be knit from the `Report_KRI.Rmd` template. All intermediate files from the knitting process will be saved in a temporary folder. + +```{r eval = FALSE, include = TRUE} +lReport <- gsm.kri::Report_KRI(lCharts = lCharts, + dfResults = dfResults, + dfGroups = dfGroups, + dfMetrics = dfMetrics, + strOutputFile = "test_kri_report.html") +``` + +Below, you will see a screenshot from the beginning of the report. All charts for all metrics that were included throughout the analysis and reporting workflows will be included in this report. 
+ +![](report_screenshot.png){width="100%"} + +------------------------------------------------------------------- + +## Using YAML Workflows to generate reports + +While it is helpful to understand how each step of this process works, we have provided a series of YAML workflow files that make running reports on multiple KRIs easy and with the ability to be automated. + +Here, you will see how to run your workflows. The general approach is to run the analytics workflow(s), followed by the reporting workflow `data_reporting.yaml` followed by the charts and reports workflow `reports.yaml`. This allows the user to examine the output of each workflow individually before moving on to the next step. + +### Option 1 - Run All Workflows Separately + +```{r eval = FALSE, include = TRUE} +# Step 1 - Create Mapped Data - filter/map raw data +# Source Data +core_mappings <- c("AE", "COUNTRY", "DATACHG", "DATAENT", "ENROLL", "LB", + "PD", "QUERY", "STUDY", "STUDCOMP", "SDRGCOMP", "SITE", "SUBJ") + +lSource <- gsm.core::lSource + +# Step 0 - Data Ingestion - standardize tables/columns names +mappings_wf <- gsm.core::MakeWorkflowList(strNames = core_mappings, strPath = "workflow/1_mappings", strPackage = "gsm.mapping") +mappings_spec <- gsm.mapping::CombineSpecs(mappings_wf) +lRaw <- gsm.mapping::Ingest(lSource, mappings_spec) + +# Step 1 - Create Mapped Data Layer - filter, aggregate and join raw data to create mapped data layer +mapped <- gsm.core::RunWorkflows(mappings_wf, lRaw) + +# Step 2 - Create Metrics - calculate metrics using mapped data +metrics_wf <- gsm.core::MakeWorkflowList(strPath = "workflow/2_metrics", strPackage = "gsm.kri") +analyzed <- gsm.core::RunWorkflows(metrics_wf, mapped) + +# Step 3 - Create Reporting Layer - create reports using metrics data +reporting_wf <- gsm.core::MakeWorkflowList(strPath = "workflow/3_reporting", strPackage = "gsm.reporting") +reporting <- gsm.core::RunWorkflows(reporting_wf, c(mapped, list(lAnalyzed = analyzed, lWorkflows = 
metrics_wf))) + +# Step 4 - Create KRI Report - create KRI report using reporting data +module_wf <- gsm.core::MakeWorkflowList(strPath = "workflow/4_modules", strPackage = "gsm.kri") +lReports <- gsm.core::RunWorkflows(module_wf, reporting) +``` + + +---------------------------------------------------------------- + +### Recap - Reporting Workflow + + - `dfGroups` created from CTMS data using `RunQuery()`, `MakeLongMeta()` and `bind_rows()` + - `dfMetrics` created from `lWorkflow` using `MakeMetric()` + - `dfResults` created from `lAnalysis$dfSummary` using `BindResults()` + - `dfBounds` created from `dfResults` using `MakeBounds()` + - List of all charts and tables (`lCharts`) created from `dfResults`, `dfBounds`, `dfMetrics` and `dfGroups` using `MakeCharts()` + - Report generated from `lCharts`, `dfResults`, `dfMetrics` and `dfGroups` using `Report_KRI()` + +----------------------------------------------------------------- + +# Appendix 1 - Supporting Functions {#appendix-1} + + +### Mapping Functions + + - `gsm.core::RunQuery()`: Run a SQL query to create new data.frames with filtering and column name specifications. + + +### Visualization Functions + +- `gsm.kri::Visualize_Scatter()`: Creates scatter plot of Total Exposure (in days, on log scale) vs Total Number of Event(s) of Interest (on linear scale). Each data point represents one site. Outliers are plotted in red with the site label attached. This plot is only created when statistical method is **not** defined as `identity`. Chart is called `scatter` in the `lCharts` object. +- `gsm.kri::Visualize_Score()`: Provides a standard visualization for Score or KRI. Charts are called `barScore` or `barMetric` in the `lCharts` object. +- `gsm.kri::Visualize_Metric()`: Creates all available charts and tables for a metric using the data provided. + + +### Widget Functions + +- `gsm.kri::Widget_GroupOverview()`: Creates an interactive table displaying the flag distribution for all groups across all metrics. 
+- `gsm.kri::Widget_BarChart()`: Creates an interactive bar chart visualization for Score or KRI. Charts are called `barScoreJS` or `barMetricJS` in the `lCharts` object. +- `gsm.kri::Widget_ScatterPlot()`: Creates an interactive scatter plot of Total Exposure (in days, on log scale) vs Total Number of Event(s) of Interest (on linear scale). Each data point represents one site. Outliers are plotted in red with the site label attached. Chart is called `scatterJS` in the `lCharts` object. +- `gsm.kri::Widget_TimeSeries()`: Creates an interactive time series scatter plot of the score, metric or numerator. Charts are called `timeSeriesContinuousScoreJS`, `timeSeriesContinuousMetricJS`, or `timeSeriesContinuousNumeratorJS` in the `lCharts` object. + +### Table Functions + +- `gsm.kri::Report_MetricTable()`: Creates a sortable table displaying the flags per group (e.g. Site, Country) for one metric at a time. + + From 5bfe7bde28fecf0a6549bbefd6c855636838781f Mon Sep 17 00:00:00 2001 From: Laura Maxwell Date: Thu, 20 Mar 2025 11:24:01 -0400 Subject: [PATCH 05/22] update pkgdown GHA triggers --- .github/workflows/pkgdown.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index dc424a9..5ac3416 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -1,6 +1,8 @@ on: - push: + pull-request: branches: [main, dev] + push: + branches: [main] workflow_dispatch: name: pkgdown-main From 0b582029ff904bdfa3049a9e70801b9e41a82e80 Mon Sep 17 00:00:00 2001 From: Laura Maxwell Date: Thu, 20 Mar 2025 11:31:05 -0400 Subject: [PATCH 06/22] update description with suggests for vignette packages --- DESCRIPTION | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 08cf0ec..d6c47fd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -32,6 +32,8 @@ Remotes: gsm.qc=Gilead-BioStats/gsm.qc@main Suggests: knitr, - rmarkdown + rmarkdown, + DT, + 
gt VignetteBuilder: knitr URL: https://openrbqm.github.io/openrbqm/ From c98505e2677577b5c1e46feeb5b33beae89435c6 Mon Sep 17 00:00:00 2001 From: Laura Maxwell Date: Thu, 20 Mar 2025 11:37:27 -0400 Subject: [PATCH 07/22] update gsm.datasim version --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index d6c47fd..18962dd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -25,7 +25,7 @@ Imports: Remotes: gsm.core=Gilead-BioStats/gsm.core@main, gsm.app=Gilead-BioStats/gsm.app, - gsm.datasim=Gilead-BioStats/gsm.datasim@dev, + gsm.datasim=Gilead-BioStats/gsm.datasim@fix-37, gsm.kri=Gilead-BioStats/gsm.kri@main, gsm.mapping=Gilead-BioStats/gsm.mapping@main, gsm.reporting=Gilead-BioStats/gsm.reporting@main, From 00a5e76bf8329394a3a9402e837b275c8133c755 Mon Sep 17 00:00:00 2001 From: Laura Maxwell Date: Thu, 20 Mar 2025 11:51:47 -0400 Subject: [PATCH 08/22] pkgdown typo --- .github/workflows/pkgdown.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 5ac3416..db40e2f 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -1,5 +1,5 @@ on: - pull-request: + pull_request: branches: [main, dev] push: branches: [main] From 406eb88ef2cb451d7bf6e123ead09b1a5617faed Mon Sep 17 00:00:00 2001 From: Laura Maxwell Date: Thu, 20 Mar 2025 11:57:24 -0400 Subject: [PATCH 09/22] get rid of `main` refs in pkgdown yaml --- .github/workflows/pkgdown.yaml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index db40e2f..af550a2 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -5,7 +5,7 @@ on: branches: [main] workflow_dispatch: -name: pkgdown-main +name: pkgdown jobs: pkgdown-main: @@ -17,8 +17,6 @@ jobs: steps: - uses: actions/checkout@v4 - with: - ref: main - uses: r-lib/actions/setup-pandoc@v2 @@ -34,9 
+32,10 @@ jobs: - name: Generate sample reports shell: Rscript {0} run: | - devtools::load_all() + devtools::load_all(dependencies = T) dir.create(here::here("pkgdown", "assets"), recursive = TRUE) library(gsm.kri) + library(gsm.core) lCharts <- gsm.kri::MakeCharts( dfResults = gsm.core::reportingResults, From 89a9d194cfaa2c03b430847b6b28c5042c1e3da0 Mon Sep 17 00:00:00 2001 From: Laura Maxwell Date: Thu, 20 Mar 2025 11:58:36 -0400 Subject: [PATCH 10/22] typo --- .github/workflows/pkgdown.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index af550a2..b0614fa 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -32,7 +32,7 @@ jobs: - name: Generate sample reports shell: Rscript {0} run: | - devtools::load_all(dependencies = T) + devtools::load_all() dir.create(here::here("pkgdown", "assets"), recursive = TRUE) library(gsm.kri) library(gsm.core) From cba4f9cbcebb96201eee4dff8111d2fd092b70ad Mon Sep 17 00:00:00 2001 From: Laura Maxwell Date: Thu, 20 Mar 2025 12:01:15 -0400 Subject: [PATCH 11/22] install devtools in pkgdown --- .github/workflows/pkgdown.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index b0614fa..65b84a5 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -8,7 +8,7 @@ on: name: pkgdown jobs: - pkgdown-main: + pkgdown: runs-on: ubuntu-latest env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} @@ -26,7 +26,7 @@ jobs: - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::pkgdown + extra-packages: any::pkgdown, any::devtools needs: website - name: Generate sample reports From 9d39450d838a3086eb7cf49f80e086f77879ef80 Mon Sep 17 00:00:00 2001 From: Laura Maxwell Date: Thu, 20 Mar 2025 14:35:44 -0400 Subject: [PATCH 12/22] install devtools and here in pkgdown --- .github/workflows/pkgdown.yaml | 3 ++- 1 
file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 65b84a5..676374e 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -26,12 +26,13 @@ jobs: - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::pkgdown, any::devtools + extra-packages: any::pkgdown needs: website - name: Generate sample reports shell: Rscript {0} run: | + install.packages("devtools", "here") devtools::load_all() dir.create(here::here("pkgdown", "assets"), recursive = TRUE) library(gsm.kri) From 4db828f9cc4ba54d57db5915b7e079be0b2555fb Mon Sep 17 00:00:00 2001 From: Laura Maxwell Date: Thu, 20 Mar 2025 14:38:44 -0400 Subject: [PATCH 13/22] typo --- .github/workflows/pkgdown.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 676374e..e0ece29 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -32,7 +32,7 @@ jobs: - name: Generate sample reports shell: Rscript {0} run: | - install.packages("devtools", "here") + install.packages(c("devtools", "here")) devtools::load_all() dir.create(here::here("pkgdown", "assets"), recursive = TRUE) library(gsm.kri) From c79b5cfe5d743be456876ae8b4ace1bf3dbbb07c Mon Sep 17 00:00:00 2001 From: Laura Maxwell Date: Thu, 20 Mar 2025 14:46:44 -0400 Subject: [PATCH 14/22] remove duplicate vignettes --- vignettes/ContributorGuidelines.Rmd | 225 ---------------------------- vignettes/KRIMethod.Rmd | 223 --------------------------- 2 files changed, 448 deletions(-) delete mode 100644 vignettes/ContributorGuidelines.Rmd delete mode 100644 vignettes/KRIMethod.Rmd diff --git a/vignettes/ContributorGuidelines.Rmd b/vignettes/ContributorGuidelines.Rmd deleted file mode 100644 index 84b7d78..0000000 --- a/vignettes/ContributorGuidelines.Rmd +++ /dev/null @@ -1,225 +0,0 @@ ---- -title: "Contributor Guidelines" -description: "This page 
outlines the development process for `{gsm}` packages, including how to contribute by filing issues, bug reports, and submitting code via a Pull Request." -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Contributor Guidelines} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r setup, include = FALSE} -library(gsm.core) -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -# Introduction - -This page outlines the development process for `{gsm}` packages, including how to contribute by filing issues, bug reports, and submitting code via a Pull Request. - -## Prerequisites - -Before contributing code via a Pull Request, make sure to file an [issue](https://github.com/Gilead-BioStats/gsm.core/issues/new/choose) using one of the pre-specified issue templates. Choose the template that best categorizes what you aim to contribute, which generally can be one of the following: - -- Bugfix Issue: Fix a bug in the code -- Feature Issue: Develop a new feature -- QC Issue: Update QC framework, including documentation, qualification, automation, etc. - -Someone from the development team will decide if the issue is in scope. If so, the issue will be appropriately triaged and assigned to a core developer, or approval to submit a Pull Request associated with the submitted issue will be granted. If it is decided that the issue is out of scope or otherwise irrelevant, the issue will be closed. - -The issue templates provide comments/prompts to help ensure that all relevant information is included. When submitting issues for bug fixes or specific feature requests, it is often helpful to provide a minimal [reprex](https://www.tidyverse.org/help/#reprex), or reproducible example, to help the core developers visualize the issue. 
- -Suggestions or other input that might not warrant formal submission of an issue can be filed under [discussions](https://github.com/Gilead-BioStats/gsm.core/discussions), which can help facilitate discourse of specific use-cases or requests. - -## Branches - -The core branches that are used in this repository are: - -- `main`: Contains the production version of the package. -- `dev`: Contains the development version of the package. -- `fix`: Used to develop new functionality in the package. See [Development Process](#development-process) below for more details. -- `release`: Used to conduct regression testing and finalize QC documentation for a release. See [Release Process](#release-process) below for more details. - -# Development Process {#development-process} - -All code development takes place in `fix` branches. This section provides general guidance about this process flow. A detailed step-by-step workflow for code development in `fix` branches can be found in the first section of [Appendix 1](#fix-branch-workflow) below. - -Once an issue is filed and delegated to a core developer, a `fix` branch will be opened, which is where all package development related to that issue will be conducted. Each `fix` branch should be linked to one or more of the filed GitHub [issue(s)](https://github.com/Gilead-BioStats/gsm.core/issues). The issue(s) will be referenced in the naming of the `fix` branch. For example, a branch named `fix-111` addresses issue #111. Tasks related to documentation, testing, and/or qualification may also use `fix` branches and associated issues. - -In addition to the above, please also use the following general guidelines when creating a Pull Request: - -- New code should generally follow the [tidyverse style guide](https://style.tidyverse.org/), but automatic styling will be applied before each release. More details about the style guide can be found [here](#style-guide). 
-- Documentation should be included, using the [roxygen2](https://cran.r-project.org/web/packages/roxygen2/vignettes/roxygen2.html) package. -- New functions or changes to existing functions should include updated unit tests to demonstrate branch compatibility. Core developers request that unit tests are developed using [testthat \>= v3.0.0](https://testthat.r-lib.org/). -- Please include any relevant details that will provide context for the proposed updates or new functionality. Additionally, link the Pull Request to the relevant issue(s) by using either [closing keywords](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword), or the `Development` section on the sidebar of the Pull Request page. -- In general, all Pull Requests should target the `dev` branch (with the exception of a release Pull Request). -- All checks and tests must be passing before merging a Pull Request to `dev`. These checks are automatically run via GitHub Actions, as described in [Appendix 3](#github-action-workflow), but you can also run them locally by calling `devtools::check()` on your `fix` branch before finalizing a Pull Request. -- During the development process, developers should verify that the `qualification-dev-check` Action runs successfully. This checks whether the qualification tests were executed successfully or failed. This can be verified by looking under the Checks tab of a given Pull Request - the status will read either "Success" or "Failure". If there are any conflicts present or new functionality is added to code that is not covered by existing qualification tests, the developer submitting the Pull Request should add this information to the version qualification issue. - -# Release Process {#release-process} - -Code release follows a process using `release` branches. 
A release is initiated when all feature development, QC, and qualification has been completed for a given functionality. The primary objective of the Release Workflow is to conduct regression testing and finalize all QC documentation for that release. A detailed step-by-step workflow for code release can be found in the second section of [Appendix 1](#release-branch-workflow) below. - -# Style Guide {#style-guide} - -Code developers for `{gsm}` use the [tidyverse style guide](https://style.tidyverse.org/) with minimal modifications. The code below is run to standardize styling before each release: - - double_indent_style <- styler::tidyverse_style() - double_indent_style$indention$unindent_fun_dec <- NULL - double_indent_style$indention$update_indention_ref_fun_dec <- NULL - double_indent_style$line_break$remove_line_breaks_in_fun_dec <- NULL - styler::style_dir('R', transformers = double_indent_style) - styler::style_dir('tests', recursive = TRUE, transformers = double_indent_style) - -# Appendix 1 - Detailed Workflows - -## `fix` Branch Workflow {#fix-branch-workflow} - -1. Create issue(s) defining addition(s) and/or revision(s): - - Select the appropriate [template](https://github.com/Gilead-BioStats/gsm.core/issues/new/choose) to use (should be one of the following): - - `Bugfix Issue` - - `Feature Issue` - - `QC Issue` - - Assign issue(s) to core developer(s). - - Assign milestone within issue(s). -2. Developer creates `fix` branch (with nomenclature reflecting associated issue(s)) and updates the associated code(s). -3. Developer opens Pull Request for the `fix` branch to be merged into the `dev` branch using the GitHub default Pull Request template. Developer should do the following: - - Assign Pull Request to self. - - Requests review(s). - - Assign milestone. - - Link to associated issue(s). -4. Before the `fix` branch can be merged into the `dev` branch, the Pull Request must: - - Be approved by assigned code reviewer(s). 
- - Pass all GitHub qualification checks. -5. `fix` branch is merged into the `dev` branch after the above requirements are fulfilled. The user who merges the `fix` branch should make sure to delete it upon merging. - -## `release` Branch Workflow {#release-branch-workflow} - -1. Release Owner creates `release` branch from `dev` branch. - - The `release` branch should be named according to the version of the package being released (e.g., `release-v1.2.0`) using [semantic versioning](https://semver.org/). - - If a release branch is already created, make sure that it is synced with the current `dev` branch. -2. Release Owner prepares the release for QC by performing the following steps and pushing updates to the `release` branch: - - Confirm that the version in the `DESCRIPTION` file is up to date. - - Run `styler` using the script from the [style guide](#style-guide) above (or by running `gutil::style_code()`) and commit any updates. - - Update `NEWS.md` with a summary of the revisions/additions in the release. Keep any information from previous releases to maintain traceability through versions. - - Ensure that the qualification specifications spreadsheet is up-to-date and accurate. If there have been any changes/updates to qualification tests, reach out to the qualification developer to update any necessary files. - - If applicable, review `README.md` and relevant vignettes to make sure updates are accurately described. - - Ensure all unit tests are passing. - - Check if all qualification tests are passing and if new features were added that need to be qualified. If updates are needed, they should be outlined in a release QC issue. - - Run `devtools::spell_check()` and resolve findings. - - Build site using `pkgdown::build_site()`. Check that all examples are displayed correctly and that all new functions occur on the Reference page. - - Open a clean R session. 
Run `devtools::install()` and then `devtools::check()` locally and confirm that there are no issues/conflicts. -3. Release Owner creates Pull Request from the `release` branch to the `main` branch: - - Use the [release Pull Request template](https://github.com/Gilead-BioStats/gsm.core/blob/dev/.github/PULL_REQUEST_TEMPLATE/release.md) by adding `?template=release.md` to the URL when creating the Pull Request. The user can also click the link, then click `Raw`, and copy/paste the displayed Markdown into the Pull Request. - - Assign Pull Request to self. - - Request QC review(s). - - Assign milestone. - - Complete Risk Assessments for each Assessment/Feature added as outlined in the Pull Request template. - - Create comments in the Pull Request with a unique [QC checklist](#appendix-2---qc-checklist) for each selected Assessment/Feature (See [example for v0.1.0](https://github.com/Gilead-BioStats/gsm.core/pull/194)). -4. QC Reviewer(s) conduct(s) review by: - - Completing all QC checklists in the Pull Request. - - Ensuring all GitHub Actions on the Pull Request to the `main` branch are passing. -5. QC Reviewer(s) approve(s) Pull Request or request(s) changes. If changes are needed: - - QC Reviewer(s) should file issues and the development team should follow the standard package development process using `fix` branches. - - Once issues are resolved and merged to the `dev` branch, Release Owner can merge the `dev` branch into the `release` branch, and re-request review. - - If needed, the original Pull Request can be closed and a new release Pull Request can be created with a Release Candidate (RC) value added to the branch name (e.g., `release-v1.2.0-RC2`) -6. Once the Pull Request is approved, the Release Owner should complete the release by taking the following steps: - - Merge the release Pull Request to the `main` branch. 
- - Create the GitHub release targeting the `main` branch using the wording from `NEWS.md`, in addition to the automatically generated content in GitHub. - - Confirm that the QC Report is attached to release. -7. Finally, the Release Owner (or qualified designee) should complete the following housekeeping tasks: - - Create a Pull Request to merge the `main` branch into the `dev` branch to sync any updates that were made during release process. - - Check that all issues associated with the current release are closed. - - Update the milestone for any incomplete tasks. - - Delete code branches associated with previous releases. - - Close the milestone and project associated with the previous release. - -# Appendix 2 - QC Checklist {#appendix-2---qc-checklist} - -This QC checklist is to be used as part of the Development and Release Workflows described above. When applied to an Assessment/Feature, confirm that each function meets the requirements described. When applied to utility or other functionality, use relevant sections of the checklist and modify QC checks as needed. A risk-based approach will be used to determine whether each release requires a high-level or detailed release QC. - -### High-Level QC Checklist - -- [ ] Documentation - - [ ] New functionality contains an `@export` tag. - - [ ] New functionality contains an adequate level of documentation. -- [ ] Error Checking - - [ ] New functionality has associated unit test(s). - - [ ] Tests confirm that the input data has required columns (if any). - - [ ] Tests confirm that the output data has expected columns/structure. - - [ ] Tests confirm intended functionality for each parameter. -- [ ] Data Model - - [ ] Running `Make_Snapshot()` using defaults returns no errors. - - [ ] Running `CheckSnapshotInputs()` using the output of `Make_Snapshot()$lSnapshot` returns no errors. - - [ ] Running `Study_Assess()` using defaults returns no errors. 
- - [ ] Running `Study_Report()` using the output of `Study_Assess()` from above produces a HTML report that accurately displays results. -- [ ] Basic QC - - [ ] Assessment has User Requirements + Qualification tests captured using qualification framework. A QC report is generated as expected and all checks pass successfully. - - [ ] Code is well commented and easy to read/understand. - - [ ] Qualification specifications spreadsheet (`qualification_specs.csv`) has been reviewed and approved by a qualification developer. - - [ ] No file paths or other company-specific data are present. - - [ ] `devtools::check()` passes with no errors/warnings/notes. - - [ ] Package documents are up to date, and running `devtools::document()` does not change any files. - - [ ] Codes use `{tidyverse}` best practices for standard data manipulation. If unclear, reviewer should start a discussion thread. - - [ ] All new dependencies add significant value. If unclear, reviewer should start a discussion thread. - - [ ] All GitHub Actions run with no errors. - -### Detailed QC Checklist - -- [ ] Documentation - - [ ] Function name captured in [roxygen2 title](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd.html#the-description-block) (e.g., "Adverse Event Assessment") - - [ ] Assessment purpose captured in [roxygen2 description](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd.html#the-description-block) (e.g., "Evaluates adverse event (AE) rates to identify sites that may be over- or under-reporting AEs") - - [ ] Input data requirements are captured in a dedicated [roxygen2 details section](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd.html#sections) under *Data specification* (`#' @section Data specification`, or *\# Data specification* if storing data specification in a `.md` file). 
- - [ ] Statistical methods and assumptions are captured in a dedicated [roxygen2 details section](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd.html#sections) under *Statistical assumptions* (`#' @section Statistical assumptions`, or *\# Statistical assumptions* if storing statistical assumptions in a `.md` file). This section should link to the relevant `Analyze_` function(s) for further details. - - [ ] All function parameters are described with a [`@param` tag](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd.html#functions). Each parameter description should include its name, type, purpose, usage details, default value (if applicable), requirement, and valid options (if applicable). - - [ ] All external dependencies are captured. Use `@importFrom _package_ _function_` when importing five (5) or fewer functions, and `@import _package_` otherwise. - - [ ] Function output is captured with a [`@return` tag](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd.html#functions). Each output description should include output type, structure, and data specification (if applicable). - - [ ] At least one (1) example is provided under an [`@examples` tag](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd.html#functions). -- [ ] Error Checking - - [ ] Basic checks for all parameters should be included using `stopifnot()` or similar logic (e.g., `stopifnot("dfInput is not a data frame" = is.data.frame(dfInput))`) - - [ ] Tests confirm that `stopifnot()` parameter checks are working as expected. - - [ ] Tests confirm that the input data has required columns (if any). - - [ ] Tests confirm that the output data has expected columns/structure. - - [ ] Tests confirm intended functionality for each parameter. - - [ ] Tests confirm that missing data in required columns is handled appropriately and errors/warnings are produced if needed. 
-- [ ] Basic QC - - [ ] Assessment has User Requirements + Qualification tests captured using qualification framework. A QC report is generated as expected and all checks pass successfully. - - [ ] Code is well commented and easy to read/understand. - - [ ] Qualification specifications spreadsheet (`qualification_specs.csv`) has been reviewed and approved by a qualification developer. - - [ ] No file paths or other company-specific data are present. - - [ ] Function called from non-tidyverse dependencies are called via `::`. - - [ ] `devtools::check()` passes with no errors/warnings/notes. - - [ ] Package documents are up to date, and running `devtools::document()` does not change any files. - - [ ] Codes use `{tidyverse}` best practices for standard data manipulation. If unclear, reviewer should start a discussion thread. - - [ ] All new dependencies add significant value. If unclear, reviewer should start a discussion thread. - - [ ] All GitHub Actions run with no errors. - -# Appendix 3 - Continuous Integration with GitHub Actions {#github-action-workflow} - -GitHub Actions are used in all `{gsm}` packages to automate processes and ensure all code and documentation is created consistently and documented thoroughly. - -## Merges to `dev` Branch - -- R CMD Check (`R-CMD-check-dev`): - - Basic R CMD check which can be run using `rcmdcheck::rcmdcheck()` - - Provides an additional check for the ability to build the `pkgdown` reference index and ensure that all functions are documented correctly. This check will run on `ubuntu-latest` and on R version 4.1.3. -- Build Markdown (`build-markdown`): - - Builds Assessment Specification tables from function documentation - - Outputs are added to `man`/`.md` and any changes are committed to the compare branch or the triggering Pull Request. -- Test Coverage (`test-coverage`): - - Uses `{covr}` to check the package coverage. 
-- Qualification Check (`qualification-check-dev`): - - Runs the qualification tests but will not fail if any of the tests do not pass. Developers should review this check when changes that might need updates to qualification are done. - -## Merges to `main` Branch - -- R CMD Check (`R-CMD-check-main`): - - Basic R CMD check which can be run using `rcmdcheck::rcmdcheck()` - - Provides an additional check for the ability to build the `pkgdown` reference index and ensure that all functions are documented correctly. The check will also run all qualification tests to ensure that the release is fully qualified. This check will run on `ubuntu-latest` and on R version 4.1.3. Additionally, it will be run on the latest R release version on `windows-latest`, `macOS-latest`, and `ubuntu-latest`. -- `pkgdown`: - - Builds the [pkgdown site](https://gilead-biostats.github.io/gsm.core/) for the relevant package (`{gsm.core}` in this case). -- Qualification Report (`qualification-report`): - - Builds the qualification vignette as an attached artifact to the Pull Request. This should be reviewed by the Pull Request Owner for completeness and correctness to ensure that the artifact added to the release is correct. diff --git a/vignettes/KRIMethod.Rmd b/vignettes/KRIMethod.Rmd deleted file mode 100644 index ec1bd25..0000000 --- a/vignettes/KRIMethod.Rmd +++ /dev/null @@ -1,223 +0,0 @@ ---- -title: "KRI Method" -description: "This vignette outlines the statistical methods used to evaluate Key Risk Indicators (KRIs) in {gsm}." -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{KRI Method} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r setup, include = FALSE} -library(gsm.core) -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -# Overview - -This vignette outlines the statistical methods used to evaluate Key Risk Indicators (KRIs) in the {gsm} suite of packages. 
KRIs are metrics that allow users to measure pre-defined risks and determine the level of observed risk to data quality and patient safety in a clinical trial. The {gsm} suite of packages implements a standardized data pipeline to facilitate KRI analysis. Other vignettes provide an overview of this framework ([1](Cookbook.html), [2](DataModel.html), [3](DataAnalysis.html), -[4](https://gilead-biostats.github.io/gsm.reporting/articles/DataReporting.html)), and the statistical methods for this process are described in detail below. - -`{gsm.core}` calculates KRIs by defining a numerator and a denominator for each metric. Then by default, `{gsm.core}` calculates z-scores using a normal approximation with adjustment for over-dispersion to assign risk levels. - -For KRIs that are percentages (binary outcome), the numerator is the # of events and the denominator is the # of total participants, and we then apply the normal approximation of the binomial distribution to determine a risk level. - -For KRIs that are rates (count outcome), the numerator is the # of events and the denominator is the total participant exposure or study duration, and we then apply the normal approximation of the Poisson distribution to determine a risk level. - -Alternative statistical methods to calculate standardized scores are also available in `{gsm.core}`, including the Identity, Fisher and Poisson methods. More details are provided below. - -# Statistical Methods - -## 1. The Normal Approximation Method - -### Introduction - -This method applies normal approximation of binomial distribution to the binary outcome KRIs, or normal approximation of Poisson distribution for the rate outcome KRIs (the sample sizes or total exposure of the sites) to assess data quality and safety. The control limits based on the asymptotic normal approximation are constructed to serve as risk thresholds for identifying site-level risks. - -Reference: Zink, Richard C., Anastasia Dmitrienko, and Alex Dmitrienko.
**Rethinking the clinically based thresholds of TransCelerate BioPharma for risk-based monitoring.** *Therapeutic Innovation & Regulatory Science* 52, no. 5 (2018): 560-571. - -### Methods - -#### Binary - -Consider the problem of monitoring KRIs with binary outcomes, such as protocol deviation or discontinuation from the study, across multiple sites in a clinical trial. Assume that there are $m$ sites with $n_i$ patients at the $i$ th site, $i = 1, 2, \dots, m$. Denote the total number of patients in the study by $n=\sum_{i=1}^m n_i$. Let $X_{ij}$ signify the outcome of interest for the $j$ th patient at the $i$ th site, where $X_{ij}=1$ indicates that an event has occurred and $X_{ij}=0$ indicates that an event has not occurred. Finally, let $p_i$ denote the site-level proportion at the $i$ th site. Monitoring tools focus on testing the null hypothesis of consistency of the true site-level proportion across multiple sites. Specifically, the null hypothesis states that the site-level proportion of the binary outcome is constant across the sites, that is, $H_0: p_1 = \dots = p_m = p$, where $p$ is the common proportion. This common proportion can be estimated as $\hat{p} = \frac{1}{n}\sum_{i=1}^m\sum_{j=1}^{n_i}X_{ij}$. - -The control limits are computed using confidence limits based on an asymptotic normal approximation. A 95% -confidence interval is obtained if the significance level $\alpha=0.05$. Let $X_i=\sum_{j=1}^{n_i}X_{ij}$ represent the total number of events that occur and let $\hat{p}_i=X_i/n_i$ denote the estimated event rate at the $i$ th site. The asymptotic $100(1 - \alpha)\%$ confidence interval for $p_i$ is given by $$ -\hat{p}_i-z_{1-\alpha/2}\sqrt{\frac{\hat{p}_i(1-\hat{p}_i)}{n_i}} \leq p_i \leq \hat{p}_i+z_{1-\alpha/2}\sqrt{\frac{\hat{p}_i(1-\hat{p}_i)}{n_i}} -$$ where $z_{1-\alpha/2}$ is the $(1-\alpha/2)$ quantile of the standard normal distribution.
To construct the control limits for the observed event rate at this site, the estimated event rate $\hat{p}_i$ is forced to be equal to the overall event rate $\hat{p}$. This means that the lower (l) and upper (u) asymptotic control limits for the $i$ th site are defined as $l_i=\hat{p}-z_{1-\alpha/2}\sqrt{\frac{\hat{p}(1-\hat{p})}{n_i}}$ and $u_i=\hat{p}+z_{1-\alpha/2}\sqrt{\frac{\hat{p}(1-\hat{p})}{n_i}}$, respectively. Asymptotic control limits may not be reliable in smaller clinical trials, so exact limits for an event rate may be preferable. - -#### Rate - -Assume that the distribution of number of events up to time $t$ is Poisson with mean $\lambda t$, where $\lambda$ is the event rate for a given unit of time. For the $i$ th site with $X_i=\sum_{j=1}^{N_i}X_{ij}$ events and $T_i=\sum_{j=1}^{N_i}t_{ij}$ exposure, define the exposure-adjusted incidence rate (EAIR) as $\hat{\lambda}_i=\frac{X_i}{T_i}$. For all sites, define $X=\sum_{i=1}^{m}X_{i}$ and $T=\sum_{i=1}^{m}T_{i}$ with $\hat{\lambda}=\frac{X}{T}$. Under a normal approximation, the $100(1 - \alpha)\%$ confidence interval for the $i$ th site is $$ -\hat{\lambda}_i-z_{1-\alpha/2}\sqrt{\frac{\hat{\lambda}_i}{T_i}} \leq \lambda_i \leq \hat{\lambda}_i+z_{1-\alpha/2}\sqrt{\frac{\hat{\lambda}_i}{T_i}}. -$$ For these funnel plots accounting for exposure, the x-axis representing the site sample size ($n$) in the above examples is replaced by the total exposure time $T$. To develop a funnel plot, fix $\hat{\lambda}_i=\hat{\lambda}$, and vary $T$ from $\min(T_i)$ to $\max(T_i)$ to compute the control limits. As an area of future research, the work of Chan and Wang (2009) may suggest methods appropriate for computing an exact confidence interval for the EAIR. Finally, similar methods can be applied for a count-type endpoint $X_{ij}$, where $t_{ij}$ would denote the time on study for the $j$ th patient at the $i$ th site.
- -### KRI Metric and Z-score - -The KRI metric along with a KRI score are created for each site to measure the level of observed risk to data quality and patient safety in a clinical trial. For scoring purposes, Z-scores from the normal approximation are calculated and defined as such: $z_i=\frac{y_i-\theta_0}{\sqrt{V(Y|\theta_0)}}$ for site $i$, where $y_i$ is the KRI metric calculated for site $i$, $\theta_0$ is the overall mean, and $\sqrt{V(Y|\theta_0)}$ is the standard deviation of the metric under the null. - -For binary outcome, $\sqrt{V(Y|\theta_0)}=\sqrt{\frac{\hat{p}(1-\hat{p})}{n_i}}$. - -For rate outcome, $\sqrt{V(Y|\theta_0)}=\sqrt{\frac{\hat{\lambda}}{T_i}}$. - -### Over-dispersion adjustment - -The standard normal approximation method described above assumes the null distribution fully expresses the variability of the sites in-control, but in many situations this assumption will not hold. When there is greater variability than expected, the majority of the sites will fall outside the specified limits, casting doubt on the appropriateness of the constructed limits. - -A way of handling this issue is to allow over-dispersion in the normal approximation. A multiplicative over-dispersion adjustment was implemented in our approach. - -Suppose a sample of $m$ units is assumed to be **in-control**; then the over-dispersion factor $\phi$ can be estimated as the mean squared z-scores, i.e., $\hat\phi = \frac{1}{m}\sum_{i=1}^m z_i^2$. -For binary outcome, the over-dispersion adjusted variance is $V'(Y_i|\phi, p)=\phi\frac{{p}(1-p)}{n_i}$. -For rate outcome, the over-dispersion adjusted variance is $V'(Y_i|\phi, \lambda)=\phi\frac{\lambda}{T_i}$. -Therefore, after the over-dispersion adjustment, the adjusted z-scores for site $i$ are $z_i = \frac{\hat{p}_i - \hat{p}}{\sqrt{\hat\phi \frac{{\hat{p}}(1-\hat{p})}{n_i}}}$, $z_i = \frac{\hat{\lambda}_i - \hat{\lambda}}{\sqrt{\hat\phi \frac{\hat\lambda}{T_i}}}$, respectively. - -Reference: Spiegelhalter, David J.
**Funnel plots for comparing institutional performance.** *Statistics in medicine* 24.8 (2005): 1185-1202. - -### Estimate and Score - -The function `Analyze_NormalApprox()` in `{gsm.core}` calculates an adjusted z-score for each site as discussed above. The adjusted z-scores are then used as a scoring metric in `{gsm.core}` to flag possible outliers using the thresholds discussed below. - -### Threshold - -By default, sites with adjusted z-score exceeding $\pm 2$ or $\pm 3$ from the normal approximation analysis are flagged as amber or red, respectively. The thresholds are set at common choices corresponding to 95.4% and 99.7% of the data around the mean in a standard normal distribution. However, they are fully configurable in the package and can be customized and specified in the `{gsm.core}` functions. - -### Special Situations - -1. Results are not interpretable or it is not appropriate to apply the asymptotic method: We don't want to flag in certain situations when results are not interpretable or when it is not appropriate to apply the asymptotic method due to the small sample sizes. The default threshold for minimum denominator requirement is 30 days exposure or 3 patients at the site level. - -### Recommendation - -Normal approximation method can be used in all scenarios with binary or rate KRIs. - -## 2. The Identity Method - -Identity method simply uses the count of events in the numerator of the KRI metric itself as the score. The thresholds for monitoring site risk are set based on the actual counts. - -## 3. The Fisher's Exact Method - -### Introduction - -For the binary outcome KRIs, an optional method in `{gsm.core}` is implemented with Fisher's exact test. - -Fisher's exact test is a statistical significance test used in the analysis of contingency tables when we have nominal variables and want to find out if proportions for one variable are different among values of the other variables.
- -In contrast to large-sample based asymptotic statistics which rely on approximation, Fisher's exact test can be applied when sample sizes are small. - -The function `Analyze_Fisher` in `{gsm.core}` utilizes `stats::fisher.test` to generate an estimate of odds ratio as well as p-value using the Fisher's exact test with site-level count data. For each site, Fisher's exact test is conducted by comparing to all other sites combined in a 2×2 contingency table. The p-values are then used as a scoring metric in `{gsm.core}` to flag possible outliers. The default in `stats::fisher.test` uses a two-sided test (equivalent to testing the null of OR = 1) and does not compute p-values by Monte Carlo simulation unless `simulate.p.value = TRUE`. Sites with p-values less than 0.05 from the Fisher's exact test analysis are flagged by default. The significance level was set at a common choice. - -### Methods - -For example, in a $2 \times 2$ contingency table comparing a particular site to all other sites combined, the two rows displaying the binary outcome are considered repeated Bernoulli random samples with same probability $p=0.5$ of success or failure under the null. Given a $2 \times 2$ contingency table, - -```{r echo = FALSE, results = 'asis'} -library(gt) -table1<-data.frame(Site1=c("a","b"), RestSites=c("c","d")) -rownames(table1)<-c("Yes", "No") -gt::gt(table1) -``` - -Fisher (1922) showed that conditional on the margins of the table, $a$ is distributed as a hypergeometric distribution with $a+c$ draws from a population with $a+b$ successes and $c+d$ failures. Let $n=a+b+c+d$, the probability of obtaining such set of values is given by: - -$$ -p=\frac{{{a+b} \choose a} {{c+d} \choose c}}{{n \choose {a+c}}}=\frac{{{a+b} \choose b} {{c+d} \choose d}}{{n \choose {b+d}}}=\frac{(a+b)!(c+d)!(a+c)!(b+d)!}{a! b! c! d! n!}. 
-$$ - -### Estimate and Score - -The function `Analyze_Fisher()` in `{gsm.core}` utilizes `stats::fisher.test()` to generate an estimate of odds ratio as well as p-value using the Fisher's exact test with site-level count data. For each site, Fisher's exact test is conducted by comparing to all other sites combined in a $2 \times 2$ contingency table. The p-values are then used as a scoring metric in `{gsm.core}` to flag possible outliers using the thresholds discussed below. The default in `stats::fisher.test()` uses two-sided test (equivalent to testing the null: OR=1) and not to compute p-values by Monte Carlo simulation unless `simulate.p.value = TRUE` is specified. - -### Threshold - -By default, sites with p-values less than 0.05 or 0.01 from the Fisher's exact test analysis are flagged as amber or red, respectively. The thresholds are set based on empirical p-value approach, where we use the distribution of the p-values to find the best separation of the data to identify sites at risk. The default thresholds are set at common choices of significance levels. However, they are fully configurable in the package and can be customized and specified in the `{gsm.core}` functions. - - -### The Fisher's exact test assumptions - -1. The row totals and the column totals are both fixed by design. - -2. The samples are mutually exclusive and mutually independent. - -The assumptions can be assessed by the knowledge of data collected. No assumption check is necessary. - -### Special situations - -1. Functionally: where we don't have required input to run Fishers: p-value will be set `NA`. - -2. Results not interpretable: we don't want to flag in certain situations when results not interpretable due to small sample sizes. The default threshold for minimum denominator requirement is 3 patients at the site level. - -3. 
An observed zero cell is not an issue when using Fisher's exact test, however, when the expected cell is zero, it means either the marginal is zero (meaningless) or there are structural zeros (need to consider zero-inflated issue: West, L. and Hankin, R. (2008), "Exact Tests for Two-Way Contingency Tables with Structural Zeros," Journal of Statistical Software, 28(11), 1--19). - -### Constraints - -For small samples, Fisher's exact test is highly discrete. Fisher's exact test is often considered to be more conservative. This may be due to the use of a discrete statistic with fixed significance levels ([FET Controversies Wiki](https://en.wikipedia.org/wiki/Fisher%27s_exact_test#Controversies)). - -Although in practice, Fisher's exact test is usually used when sample sizes are small (e.g., n\<5), it is valid for all sample sizes. However, when sample sizes are large, the computation of the exact test evaluating the hypergeometric probability function given the marginal can take a very long time. - -### Recommendation - -Fisher's exact test can be used in all scenarios with binary KRIs. - - -## 4. The Poisson Regression Method - -### Introduction - -For the rate outcome KRIs, an optional method in `{gsm.core}` is implemented with Poisson regression. - -The Poisson distribution is often used to model count data. If $Y$ is the number of counts following Poisson distribution, the probability mass function is given by $$ -f(y)=\frac{\mu^ye^{-\mu}}{y!} -$$ where $\mu$ is the average number of counts and $E(Y)=Var(Y)=\mu$. - -### Methods - -This method fits a Poisson model to site-level data and then calculates deviance residuals for each site. The Poisson model is run using standard methods in the `stats` package by fitting a `glm` model with family set to `poisson` using a "log" link. Site-level deviance residuals are calculated using `resid` from `stats::predict.glm` via `broom::augment`.
- -Let $Y_1, ..., Y_N$ be independent random variables with $Y_i \sim Poisson(\mu_i)$ denoting the number of events observed from $n_i$ for the $i$th observation following Poisson distribution. Then $E(Y_i)=\mu_i=n_ie^{x_i\beta}$. Thus, the log-linear generalized linear model (Poisson regression) is -$$ -\log{\mu_i}=\log{n_i}+x_i\beta \quad Y_i \sim Poisson(\mu_i) -$$ - -where $\log{n_i}$ is an offset term. - -### Estimate and Score - -The function `Analyze_Poisson()` in `{gsm.core}` utilizes `stats::glm()` to generate an estimate of fitted values as well as deviance residuals with site-level count data. The deviance residuals are then used as a scoring metric in `{gsm.core}` to flag possible outliers using the thresholds discussed below. - -### Threshold - -By default, sites with deviance residuals exceeding $\pm 5$ or $\pm 7$ from the Poisson analysis are flagged as amber or red, respectively. The thresholds are set based on an empirical approach, where we use the distribution of the deviance residuals to find the best separation of the data to identify sites at risk. The default thresholds are set at empirical values based on pilot studies' data. However, they are fully configurable in the package and can be customized and specified in the `{gsm.core}` functions. - - -### Special Situations - -1. Results are not interpretable or it is not appropriate to apply the Poisson method: We don't want to flag in certain situations when results are not interpretable or when it is not appropriate to apply the Poisson method due to the small sample sizes. The default threshold for minimum denominator requirement is 30 days exposure at the site level. - - - -### Poisson regression assumptions - -1. **Independence** The responses $y_i$ are independent of each other. - -2. **Count data** The responses $y_i$ are non-negative integers (counts). - -3. **Poisson response** Each $Y_i$ follows the Poisson distribution as noted above with mean and variance equal to $\mu_i$. - -4.
**Linearity** $\log{\mu_i}=\log{n_i}+x_i\beta$ where $x_i$ are independent predictors. - -### Assumption checks, constraints and model diagnosis - -1. The assumptions on independence and counted data can be assessed by the knowledge of data collected. - -2. The assumptions on Poisson response can be checked by plotting histogram of the data and comparing empirical mean and variance stratified by the explanatory variable(s). If there is evidence that the assumption of mean=variance is violated, oftentimes we observe variance\>mean. This is called overdispersion. In this case, negative binomial distribution provides an alternative where $Var(Y_i)=\phi E(Y_i)$. - -3. Diagnosis: Goodness of fit test (chi-squared) and deviance residuals. Residuals vs fitted plot. Q-Q plot. - -4. Other considerations: Structural zeros may happen in contrast to random zeros due to sampling from poisson distribution. In this case, a mixture model (zero-inflated Poisson model) may be required. - -### Recommendation - -Use this method when Poisson assumptions hold. 
From bfdce1c918f16db1b7ac6d8215017c6dd3431c3e Mon Sep 17 00:00:00 2001 From: Laura Maxwell Date: Thu, 20 Mar 2025 14:55:47 -0400 Subject: [PATCH 15/22] don't export functions --- R/attach.R | 4 ---- 1 file changed, 4 deletions(-) diff --git a/R/attach.R b/R/attach.R index d87d774..17026f6 100644 --- a/R/attach.R +++ b/R/attach.R @@ -70,10 +70,6 @@ openrbqm_detach <- function() { #' List all packages imported by openrbqm #' -#' @export -#' -#' @examples -#' #openrbqm_packages() openrbqm_packages <- function() { # get all imports from openrbqm's package description file raw <- utils::packageDescription("openrbqm")$Imports From fc9d3eae339f496125e6e10cc70cca50aba47128 Mon Sep 17 00:00:00 2001 From: Laura Maxwell Date: Thu, 20 Mar 2025 15:08:23 -0400 Subject: [PATCH 16/22] maybe version is the issue --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 18962dd..2f5c310 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: openrbqm Title: What the Package Does (One Line, Title Case) -Version: 0.0.0.9000 +Version: 0.0.1 Authors@R: c( person("Jeremy", "Wildfire", email = "jwildfire@gmail.com", role = c("aut", "cre")), person("Laura", "Maxwell", email = "lkmaxwell23@gmail.com", role = c("aut")) From e200a82ebd83090a463fb547705d8f93f4cdb344 Mon Sep 17 00:00:00 2001 From: Laura Maxwell Date: Thu, 20 Mar 2025 16:00:55 -0400 Subject: [PATCH 17/22] fix deploy --- .github/workflows/pkgdown.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index e0ece29..48dbc76 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -76,4 +76,4 @@ jobs: run: | git config --local user.email "actions@github.com" git config --local user.name "GitHub Actions" - Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' + Rscript -e 'pkgdown::deploy_to_branch(install = T)' From 
8fec7bf08a51e4a5accb0ba37b4f6319083f34a1 Mon Sep 17 00:00:00 2001 From: Laura Maxwell Date: Thu, 20 Mar 2025 16:07:16 -0400 Subject: [PATCH 18/22] a couple stray gsm references --- vignettes/articles/ContributorGuidelines.Rmd | 27 ++++++++++---------- vignettes/articles/KRIMethod.Rmd | 2 +- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/vignettes/articles/ContributorGuidelines.Rmd b/vignettes/articles/ContributorGuidelines.Rmd index 449fa40..84b7d78 100644 --- a/vignettes/articles/ContributorGuidelines.Rmd +++ b/vignettes/articles/ContributorGuidelines.Rmd @@ -1,5 +1,6 @@ --- title: "Contributor Guidelines" +description: "This page outlines the development process for `{gsm}` packages, including how to contribute by filing issues, bug reports, and submitting code via a Pull Request." output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Contributor Guidelines} @@ -8,7 +9,7 @@ vignette: > --- ```{r setup, include = FALSE} -library(gsm) +library(gsm.core) knitr::opts_chunk$set( collapse = TRUE, comment = "#>" @@ -17,11 +18,11 @@ knitr::opts_chunk$set( # Introduction -This page outlines the development process for `{gsm}`, including how to contribute by filing issues, bug reports, and submitting code via a Pull Request. +This page outlines the development process for `{gsm}` packages, including how to contribute by filing issues, bug reports, and submitting code via a Pull Request. ## Prerequisites -Before contributing code via a Pull Request, make sure to file an [issue](https://github.com/Gilead-BioStats/gsm/issues/new/choose) using one of the pre-specified issue templates. Choose the template that best categorizes what you aim to contribute, which generally can be one of the following: +Before contributing code via a Pull Request, make sure to file an [issue](https://github.com/Gilead-BioStats/gsm.core/issues/new/choose) using one of the pre-specified issue templates. 
Choose the template that best categorizes what you aim to contribute, which generally can be one of the following: - Bugfix Issue: Fix a bug in the code - Feature Issue: Develop a new feature @@ -31,7 +32,7 @@ Someone from the development team will decide if the issue is in scope. If so, t The issue templates provide comments/prompts to help ensure that all relevant information is included. When submitting issues for bug fixes or specific feature requests, it is often helpful to provide a minimal [reprex](https://www.tidyverse.org/help/#reprex), or reproducible example, to help the core developers visualize the issue. -Suggestions or other input that might not warrant formal submission of an issue can be filed under [discussions](https://github.com/Gilead-BioStats/gsm/discussions), which can help facilitate discourse of specific use-cases or requests. +Suggestions or other input that might not warrant formal submission of an issue can be filed under [discussions](https://github.com/Gilead-BioStats/gsm.core/discussions), which can help facilitate discourse of specific use-cases or requests. ## Branches @@ -46,7 +47,7 @@ The core branches that are used in this repository are: All code development takes place in `fix` branches. This section provides general guidance about this process flow. A detailed step-by-step workflow for code development in `fix` branches can be found in the first section of [Appendix 1](#fix-branch-workflow) below. -Once an issue is filed and delegated to a core developer, a `fix` branch will be opened, which is where all package development related to that issue will be conducted. Each `fix` branch should be linked to one or more of the filed GitHub [issue(s)](https://github.com/Gilead-BioStats/gsm/issues). The issue(s) will be referenced in the naming of the `fix` branch. For example, a branch named `fix-111` addresses issue #111. Tasks related to documentation, testing, and/or qualification may also use `fix` branches and associated issues. 
+Once an issue is filed and delegated to a core developer, a `fix` branch will be opened, which is where all package development related to that issue will be conducted. Each `fix` branch should be linked to one or more of the filed GitHub [issue(s)](https://github.com/Gilead-BioStats/gsm.core/issues). The issue(s) will be referenced in the naming of the `fix` branch. For example, a branch named `fix-111` addresses issue #111. Tasks related to documentation, testing, and/or qualification may also use `fix` branches and associated issues. In addition to the above, please also use the following general guidelines when creating a Pull Request: @@ -78,7 +79,7 @@ Code developers for `{gsm}` use the [tidyverse style guide](https://style.tidyve ## `fix` Branch Workflow {#fix-branch-workflow} 1. Create issue(s) defining addition(s) and/or revision(s): - - Select the appropriate [template](https://github.com/Gilead-BioStats/gsm/issues/new/choose) to use (should be one of the following): + - Select the appropriate [template](https://github.com/Gilead-BioStats/gsm.core/issues/new/choose) to use (should be one of the following): - `Bugfix Issue` - `Feature Issue` - `QC Issue` @@ -98,12 +99,10 @@ Code developers for `{gsm}` use the [tidyverse style guide](https://style.tidyve ## `release` Branch Workflow {#release-branch-workflow} 1. Release Owner creates `release` branch from `dev` branch. - - The `release` branch should be named according to the version of `{gsm}` being released (e.g., `release-v1.2.0`) using [semantic versioning](https://semver.org/). + - The `release` branch should be named according to the version of the package being released (e.g., `release-v1.2.0`) using [semantic versioning](https://semver.org/). - If a release branch is already created, make sure that it is synced with the current `dev` branch. 2. 
Release Owner prepares the release for QC by performing the following steps and pushing updates to the `release` branch: - Confirm that the version in the `DESCRIPTION` file is up to date. - - **After** the version in the `DESCRIPTION` file is updated, run `gsm::UpdateGSMVersion()` to update metadata that includes the `{gsm}` version number. - - Run `styler` using the script from the [style guide](#style-guide) above (or by running `gutil::style_code()`) and commit any updates. - Update `NEWS.md` with a summary of the revisions/additions in the release. Keep any information from previous releases to maintain traceability through versions. - Ensure that the qualification specifications spreadsheet is up-to-date and accurate. If there have been any changes/updates to qualification tests, reach out to the qualification developer to update any necessary files. @@ -114,12 +113,12 @@ Code developers for `{gsm}` use the [tidyverse style guide](https://style.tidyve - Build site using `pkgdown::build_site()`. Check that all examples are displayed correctly and that all new functions occur on the Reference page. - Open a clean R session. Run `devtools::install()` and then `devtools::check()` locally and confirm that there are no issues/conflicts. 3. Release Owner creates Pull Request from the `release` branch to the `main` branch: - - Use the [release Pull Request template](https://github.com/Gilead-BioStats/gsm/blob/dev/.github/PULL_REQUEST_TEMPLATE/release.md) by adding `?template=release.md` to the URL when creating the Pull Request. The user can also click the link, then click `Raw`, and copy/paste the displayed Markdown into the Pull Request. + - Use the [release Pull Request template](https://github.com/Gilead-BioStats/gsm.core/blob/dev/.github/PULL_REQUEST_TEMPLATE/release.md) by adding `?template=release.md` to the URL when creating the Pull Request. The user can also click the link, then click `Raw`, and copy/paste the displayed Markdown into the Pull Request. 
- Assign Pull Request to self. - Request QC review(s). - Assign milestone. - Complete Risk Assessments for each Assessment/Feature added as outlined in the Pull Request template. - - Create comments in the Pull Request with a unique [QC checklist](#appendix-2---qc-checklist) for each selected Assessment/Feature (See [example for v0.1.0](https://github.com/Gilead-BioStats/gsm/pull/194)). + - Create comments in the Pull Request with a unique [QC checklist](#appendix-2---qc-checklist) for each selected Assessment/Feature (See [example for v0.1.0](https://github.com/Gilead-BioStats/gsm.core/pull/194)). 4. QC Reviewer(s) conduct(s) review by: - Completing all QC checklists in the Pull Request. - Ensuring all GitHub Actions on the Pull Request to the `main` branch are passing. @@ -200,7 +199,7 @@ This QC checklist is to be used as part of the Development and Release Workflows # Appendix 3 - Continuous Integration with GitHub Actions {#github-action-workflow} -GitHub Actions are used in `{gsm}` to automate processes and ensure all code and documentation is created consistently and documented thoroughly. +GitHub Actions are used in all `{gsm}` packages to automate processes and ensure all code and documentation is created consistently and documented thoroughly. ## Merges to `dev` Branch @@ -211,7 +210,7 @@ GitHub Actions are used in `{gsm}` to automate processes and ensure all code and - Builds Assessment Specification tables from function documentation - Outputs are added to `man`/`.md` and any changes are committed to the compare branch or the triggering Pull Request. - Test Coverage (`test-coverage`): - - Uses `{covr}` to check the package coverage of `{gsm}`. + - Uses `{covr}` to check the package coverage. - Qualification Check (`qualification-check-dev`): - Runs the qualification tests but will not fail if any of the tests do not pass. Developers should review this check when changes that might need updates to qualification are done. 
@@ -221,6 +220,6 @@ GitHub Actions are used in `{gsm}` to automate processes and ensure all code and - Basic R CMD check which can be run using `rcmdcheck::rcmdcheck()` - Provides an additional check for the ability to build the `pkgdown` reference index and ensure that all functions are documented correctly. The check will also run all qualification tests to ensure that the release is fully qualified. This check will run on `ubuntu-latest` and on R version 4.1.3. Additionally, it will be run on the latest R release version on `windows-latest`, `macOS-latest`, and `ubuntu-latest`. - `pkgdown`: - - Builds the [pkgdown site](https://gilead-biostats.github.io/gsm/) for `{gsm}`. + - Builds the [pkgdown site](https://gilead-biostats.github.io/gsm.core/) for the relevant package (`{gsm.core}` in this case). - Qualification Report (`qualification-report`): - Builds the qualification vignette as an attached artifact to the Pull Request. This should be reviewed by the Pull Request Owner for completeness and correctness to ensure that the artifact added to the release is correct. diff --git a/vignettes/articles/KRIMethod.Rmd b/vignettes/articles/KRIMethod.Rmd index d0c3a05..e2bb25e 100644 --- a/vignettes/articles/KRIMethod.Rmd +++ b/vignettes/articles/KRIMethod.Rmd @@ -8,7 +8,7 @@ vignette: > --- ```{r setup, include = FALSE} -library(gsm) +library(gsm.core) knitr::opts_chunk$set( collapse = TRUE, comment = "#>" From 0caa29a8bdf10701bb34b3bece5c10c892893456 Mon Sep 17 00:00:00 2001 From: Laura Maxwell Date: Fri, 11 Apr 2025 12:11:31 -0400 Subject: [PATCH 19/22] Update README.md --- README.md | 46 ---------------------------------------------- 1 file changed, 46 deletions(-) diff --git a/README.md b/README.md index 2af151d..c2e9613 100644 --- a/README.md +++ b/README.md @@ -10,50 +10,4 @@ Collection of R Packages designed for Risk Based Quality Management 5. gsm.reporting 6. gsm.app 7. gsm.template -8. grail 9. 
gsm.datasim - - -## Requirements before `{openrbqm}` v1.0.0 release - -### Critical outstanding issues - -#### Overview - -- [ ] `{gsm.reporting}` repo needs to be created and reporting functions and workflows be migrated -- [ ] `{gsm.core}` stripped down after mapping, kri and reporting repo contents confirmed and with at least one release. - - [ ] all examples and most vignettes moved to `{openrbqm}` -- [ ] `{gsm.mapping}` v1.0.0 release candidate with stripped `{gsm}` dev version (or `{gsm.core}` v2.3.0-rc) as dependency -- [ ] `{gsm.kri}` v1.0.0 release candidate with `{gsm.mapping}` v1.0.0-rc and stripped `{gsm.core}` v2.3.0-rc as dependencies -- [ ] `{gsm.endpoints}` mapping edits - - [ ] direct from raw mapping yamls moved to `{gsm.mapping}` with more specific file names - - [ ] update dependencies appropriately - -#### Detailed outstanding issues - -- `{gsm.core}` issues - - [ ] [Feature: Migrate Mapping components to gsm.mapping](https://github.com/Gilead-BioStats/gsm/issues/1972) - - available on gsm branch `fix-1972v2` in this [PR](https://github.com/Gilead-BioStats/gsm/pull/2002) - - [ ] [Feature: Remove gsm.kri functions, workflows and tests](https://github.com/Gilead-BioStats/gsm/issues/2013) - - available on gsm branch `fix-2013` - - [ ] [Feature: Ensure all workflows are migrated to their respective packages](https://github.com/Gilead-BioStats/gsm/issues/2014) - - [ ] [Feature: Migrate reporting vignette to {gsm.reporting}](https://github.com/Gilead-BioStats/gsm/issues/2015) - - [ ] [Feature: Migrate vignettes to {openrbqm} package](https://github.com/Gilead-BioStats/gsm/issues/2016) - - [ ] [Feature: Migrate workflow/reporting qualification tests to {gsm.kri}](https://github.com/Gilead-BioStats/gsm/issues/2017) - - [ ] [QC: remove util- prefix in file names](https://github.com/Gilead-BioStats/gsm/issues/2026) -- `{gsm.mapping}` issues - - [x] [Write a vignette and template for how to request a new 
domain/variable](https://github.com/Gilead-BioStats/gsm.mapping/issues/4) - - [ ] [Write a vignette displaying all mapped dfs and variables ](https://github.com/Gilead-BioStats/gsm.mapping/issues/3) - - Begun, and merged into dev via PR #18, but needs some editing to fully address the issue -- `{gsm.kri}` issues - - [ ] [Migrate Vignettes and make pkgdown site](https://github.com/Gilead-BioStats/gsm.kri/issues/9) - - begun on gsm branch `fix-9` -- `{gsm.endpoints}` issues - - [ ] [Feature: Move rawplus mapping yamls to the gsm.mapping package](https://github.com/Gilead-BioStats/gsm.endpoints/issues/134) - - [ ] [Feature: Update names and meta information of workflows to be more specific](https://github.com/Gilead-BioStats/gsm.endpoints/issues/133) -- `{gsm.reporting}` issues - -### Nice to haves -- `{gsm.template}` issues -- `{gsm.datasim}` issues - - add `{gsm.endpoints}` domains to available data and vars. [(Issue here)](https://github.com/Gilead-BioStats/gsm.datasim/issues/34) From b0c0302f1fb765a052c2c5da06c3d439eb90b330 Mon Sep 17 00:00:00 2001 From: Laura Maxwell Date: Thu, 24 Apr 2025 10:33:33 -0400 Subject: [PATCH 20/22] update description --- DESCRIPTION | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2f5c310..f86cf19 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -23,13 +23,13 @@ Imports: rstudioapi, utils Remotes: - gsm.core=Gilead-BioStats/gsm.core@main, + gsm.core=Gilead-BioStats/gsm.core@v1.1.0, gsm.app=Gilead-BioStats/gsm.app, - gsm.datasim=Gilead-BioStats/gsm.datasim@fix-37, - gsm.kri=Gilead-BioStats/gsm.kri@main, - gsm.mapping=Gilead-BioStats/gsm.mapping@main, - gsm.reporting=Gilead-BioStats/gsm.reporting@main, - gsm.qc=Gilead-BioStats/gsm.qc@main + gsm.datasim=Gilead-BioStats/gsm.datasim@v1.0.0, + gsm.kri=Gilead-BioStats/gsm.kri@v1.1.0, + gsm.mapping=Gilead-BioStats/gsm.mapping@v1.0.1, + gsm.reporting=Gilead-BioStats/gsm.reporting@v1.0.1, + gsm.qc=Gilead-BioStats/gsm.qc@v1.0.1 Suggests: 
knitr, rmarkdown, From db87baed969b3ff988098d35c4c003ef9bad615c Mon Sep 17 00:00:00 2001 From: Laura Maxwell Date: Thu, 24 Apr 2025 10:49:54 -0400 Subject: [PATCH 21/22] fix DataAnalysis Vignette --- vignettes/DataAnalysis.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/DataAnalysis.Rmd b/vignettes/DataAnalysis.Rmd index c3bda73..eed5d99 100644 --- a/vignettes/DataAnalysis.Rmd +++ b/vignettes/DataAnalysis.Rmd @@ -56,7 +56,7 @@ dfInput <- Input_Rate( dfNumerator = gsm.core::lSource$Raw_AE, dfDenominator = gsm.core::lSource$Raw_SUBJ, strSubjectCol = "subjid", - strGroupCol = "siteid", + strGroupCol = "invid", strNumeratorMethod = "Count", strDenominatorMethod = "Sum", strDenominatorCol = "timeonstudy" From 129b36c302a1891d82b6754735bfd39fbf837468 Mon Sep 17 00:00:00 2001 From: Laura Maxwell Date: Thu, 24 Apr 2025 11:09:57 -0400 Subject: [PATCH 22/22] update data reporting vignette --- vignettes/DataReporting.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vignettes/DataReporting.Rmd b/vignettes/DataReporting.Rmd index 152e3bc..5bbeffb 100644 --- a/vignettes/DataReporting.Rmd +++ b/vignettes/DataReporting.Rmd @@ -68,7 +68,7 @@ To run the analysis workflow on all 13 KRIs using `clindata` Raw+ data, use the 3. `mapped$Mapped_SUBJ` - mapped data.frame of enrolled participants ```{r include = TRUE, message = FALSE} -core_mappings <- c("AE", "COUNTRY", "DATACHG", "DATAENT", "ENROLL", "LB", +core_mappings <- c("AE", "COUNTRY", "DATACHG", "DATAENT", "ENROLL", "LB", "PK", "PD", "QUERY", "STUDY", "STUDCOMP", "SDRGCOMP", "SITE", "SUBJ") lSource <- gsm.core::lSource @@ -298,7 +298,7 @@ Here, you will see how to run your workflows. 
The general approach is to run the ```{r eval = FALSE, include = TRUE} # Step 1 - Create Mapped Data - filter/map raw data # Source Data -core_mappings <- c("AE", "COUNTRY", "DATACHG", "DATAENT", "ENROLL", "LB", +core_mappings <- c("AE", "COUNTRY", "DATACHG", "DATAENT", "ENROLL", "LB", "PK", "PD", "QUERY", "STUDY", "STUDCOMP", "SDRGCOMP", "SITE", "SUBJ") lSource <- gsm.core::lSource