Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@
^_pkgdown\.yml$
^docs$
^pkgdown$
^principles\.md$
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: hrmn
Title: Harmonize Datasets
Version: 0.0.0.9004
Version: 0.0.0.9005
Authors@R:
person("Jon", "Harmon", , "jonthegeek@gmail.com", role = c("aut", "cre"),
comment = c(ORCID = "0000-0003-4781-4346"))
Expand All @@ -17,7 +17,8 @@ Depends:
Imports:
fastmatch,
rlang,
stbl (>= 0.2.0.9002)
stbl (>= 0.2.0.9002),
tibble
Suggests:
testthat (>= 3.0.0)
Remotes:
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

export(caller_arg)
export(caller_env)
export(harmonize_df)
export(harmonize_fct)
export(specify_df)
export(specify_fct)
Expand Down
18 changes: 8 additions & 10 deletions R/aaa-conditions.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,16 @@ rlang::caller_env
.hrmn_abort <- function(
message,
subclass,
call = caller_env(),
message_env = call,
.call = caller_env(),
message_env = .call,
parent = NULL,
...
) {
stbl::pkg_abort(
"hrmn",
message = message,
subclass = subclass,
call = call,
call = .call,
message_env = message_env,
parent = parent,
...
Expand All @@ -37,15 +37,14 @@ rlang::caller_env
#' @inheritParams .shared_params
#' @returns `NULL`, invisibly.
#' @keywords internal
.check_args_named <- function(..., call = rlang::caller_env()) {
.stop_if_args_unnamed <- function(..., .call = rlang::caller_env()) {
if (...length() && (is.null(...names()) || !all(nzchar(...names())))) {
.hrmn_abort(
"All arguments must be named.",
"args_unnamed",
call = call
.call = .call
)
}
invisible(NULL)
}

#' Check that all args are hrmn_spec objects
Expand All @@ -54,7 +53,7 @@ rlang::caller_env
#' @inheritParams .shared_params
#' @returns `NULL`, invisibly.
#' @keywords internal
.check_args_spec <- function(..., call = rlang::caller_env()) {
.stop_if_args_not_spec <- function(..., .call = rlang::caller_env()) {
dots <- list(...)
is_spec <- vapply(dots, inherits, logical(1), "hrmn_spec")
if (length(dots) && !all(is_spec)) {
Expand All @@ -65,10 +64,9 @@ rlang::caller_env
"All arguments must be `hrmn_spec` objects.",
"x" = "Argument{?s} {.arg {bad_args}} {?is/are} not {?a / }`hrmn_spec` object{?s}."
),
subclass = "args_not_spec",
call = call,
subclass = "not_spec",
.call = .call,
message_env = rlang::current_env()
)
}
invisible(NULL)
}
8 changes: 1 addition & 7 deletions R/aaa-shared_params.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,9 @@
#'
#' Reused parameter definitions are gathered here for easier editing.
#'
#' @param call `(environment)` The execution environment to mention as the
#' @param .call (`environment`) The execution environment to mention as the
#' source of error messages.
#' @param levels (`character`) The allowed values of the factor.
#' @param message_env (`environment`) The execution environment to use to
#' evaluate variables in error messages.
#' @param parent A parent condition, as you might create during a
#' [rlang::try_fetch()]. See [rlang::abort()] for additional information.
#' @param subclass (`character`) Class(es) to assign to the error. Will be
#' prefixed by "hrmn-error-".
#' @name .shared_params
#' @keywords internal
NULL
238 changes: 238 additions & 0 deletions R/harmonize_df.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
#' Harmonize a data frame
#'
#' @param .data (`data.frame`) A data frame to harmonize.
#' @param .spec (`hrmn_spec_df`) A data frame harmonization specification.
#' @param .unspecified_columns (`"error"`, `"drop"`, or `"keep"`) How to handle
#' columns in `.data` that are not present in `.spec`.
#' @inheritParams rlang::args_dots_empty
#'
#' @returns The input `.data` harmonized to a [tibble::tibble()].
#' @family harmonization functions
#' @examples
#' df <- data.frame(
#' size = c("Small", "Medium", "S", "M", "Large", "Lrg", "Sm"),
#' id = 1:7
#' )
#'
#' # This spec will coerce values to NA if they are not "Small", "Medium",
#' # or "Large".
#' spec <- specify_df(
#' size = specify_fct(levels = c("Small", "Medium", "Large"))
#' )
#'
#' # We can provide harmonization rules to the data before the spec is applied.
#' # Here, we harmonize the input factor to convert "S", "M", "Sm", and "Lrg" to
#' # valid values.
#' harmonize_df(
#' df,
#' size = harmonize_fct(
#' size,
#' .lookup = c("S" = "Small", "M" = "Medium", "Sm" = "Small", "Lrg" = "Large")
#' ),
#' .spec = spec,
#' .unspecified_columns = "keep"
#' )
#' @export
harmonize_df <- function(
.data,
...,
.spec = NULL,
.unspecified_columns = c("error", "drop", "keep")
) {
.spec <- .to_hrmn_spec(.spec, "df")
.data <- .harmonize_df_data(
.data,
...,
.spec = .spec,
.unspecified_columns = rlang::arg_match(.unspecified_columns)
)
return(.harmonize_df_spec(.data, .spec = .spec))
}

#' Harmonize a data frame based on data inputs
#'
#' @inheritParams harmonize_df
#' @inheritParams .shared_params
#' @returns A `data.frame` with custom harmonizations and column name
#' reconciliation applied.
#' @keywords internal
.harmonize_df_data <- function(
.data,
...,
.spec,
.unspecified_columns,
.call = rlang::caller_env()
) {
.harmonize_col_names(
.harmonize_df_dots(.data, ..., .call = .call),
.spec,
.unspecified_columns,
.call = .call
)
}

#' Harmonize a data frame based on a harmonization specification
#'
#' @inheritParams harmonize_df
#' @inheritParams .shared_params
#' @returns The input `.data` harmonized to a [tibble::tibble()].
#' @keywords internal
.harmonize_df_spec <- function(
.data,
.spec,
.call = rlang::caller_env()
) {
.data_lst <- rlang::set_names(vector("list", length(.spec)), names(.spec))
for (col_name in names(.spec)) {
.data_lst[[col_name]] <- .harmonize_col(
.data,
.spec,
col_name,
.call = .call
)
}
.data_lst <- c(.data_lst, .data[setdiff(names(.data), names(.spec))])
return(tibble::as_tibble(.data_lst))
}

#' Harmonize a data frame based on custom harmonization calls
#'
#' @inheritParams harmonize_df
#' @inheritParams .shared_params
#' @returns A `data.frame` with custom harmonizations applied.
#' @keywords internal
.harmonize_df_dots <- function(
.data,
...,
.call = rlang::caller_env()
) {
if (!...length()) {
return(.data)
}
dots <- rlang::enquos(..., .named = TRUE)
for (col_name in names(dots)) {
.data[[col_name]] <- rlang::eval_tidy(dots[[col_name]], data = .data)
}
return(.data)
}

#' Harmonize data frame column names against a specification
#'
#' This will almost definitely migrate to stbl in the future.
#'
#' @inheritParams harmonize_df
#' @inheritParams .shared_params
#' @returns A `data.frame`, possibly with columns removed.
#' @keywords internal
.harmonize_col_names <- function(
.data,
.spec,
.unspecified_columns,
.call = rlang::caller_env()
) {
.stop_if_missing_col_names(.data, .spec, .call = .call)
.harmonize_extra_col_names(.data, .spec, .unspecified_columns, .call = .call)
}

#' Stop if data frame is missing columns from specification
#'
#' @inheritParams harmonize_df
#' @inheritParams .shared_params
#' @returns `NULL` (invisibly)
#' @keywords internal
.stop_if_missing_col_names <- function(
.data,
.spec,
.call = rlang::caller_env()
) {
missing_from_data <- setdiff(names(.spec), names(.data))
if (length(missing_from_data)) {
.hrmn_abort(
c(
"The data frame is missing columns required by the specification.",
i = "Missing columns: {missing_from_data}."
),
"col_mismatch",
.call = .call,
message_env = rlang::current_env()
)
}
}

#' Harmonize extra data frame column names not in specification
#'
#' @inheritParams harmonize_df
#' @inheritParams .shared_params
#' @returns A `data.frame`, possibly with columns removed.
#' @keywords internal
.harmonize_extra_col_names <- function(
.data,
.spec,
.unspecified_columns,
.call = rlang::caller_env()
) {
extra_in_data <- setdiff(names(.data), names(.spec))
if (length(extra_in_data)) {
.data <- switch(
.unspecified_columns,
error = .hrmn_abort(
c(
"The data frame has columns not present in the specification.",
i = "Extra columns: {extra_in_data}.",
i = "Set {.arg .unspecified_columns} to {.str drop} to remove extra columns.",
i = "Set {.arg .unspecified_columns} to {.str keep} to keep extra columns."
),
"col_mismatch",
.call = .call,
message_env = rlang::current_env()
),
drop = .data[names(.spec)],
keep = .data
)
}
return(.data)
}

#' Harmonize a specific column within a data.frame
#'
#' @param .col_name (`length-1 character`) The name of the column to harmonize
#' within `.data`.
#' @inheritParams harmonize_df
#' @inheritParams .shared_params
#' @inherit harmonize_df return
#' @keywords internal
.harmonize_col <- function(
.data,
.spec,
.col_name,
.call = rlang::caller_env()
) {
this_spec <- .spec[[.col_name]]
if (inherits(this_spec, "hrmn_spec_fct")) {
return(harmonize_fct(.data[[.col_name]], .spec = this_spec))
} else {
.stop_col_has_unknown_spec(.col_name, class(this_spec), .call = .call)
}
}

#' Stop if a column has an unknown specification class
#'
#' @param .col_name (`length-1 character`) The name of the column.
#' @param class (`character`) The classes of the column specification.
#' @inheritParams .shared_params
#' @keywords internal
.stop_col_has_unknown_spec <- function(
.col_name,
class,
.call = rlang::caller_env()
) {
.hrmn_abort(
c(
"Column specification must be created with a known `specify_*()` function.",
i = "Column {(.col_name)} has specification type {.cls {class[[1]]}}."
),
"unknown_spec",
.call = .call,
message_env = rlang::current_env()
)
}
11 changes: 7 additions & 4 deletions R/harmonize_fct.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,12 @@
harmonize_fct <- function(.data, ..., .spec = NULL, .lookup = NULL) {
rlang::check_dots_empty()
.data <- stbl::to_chr(.data)
.spec <- .spec %||% specify_fct()
.data <- .apply_fct_lookup(.data, .lookup = .lookup)
return(factor(.data, levels = .spec$levels))
.spec <- .to_hrmn_spec(.spec, "fct")
.data <- .harmonize_fct_by_lookup(.data, .lookup = .lookup)
if (length(.spec)) {
return(factor(.data, levels = .spec$levels))
}
return(factor(.data))
}

#' Apply a lookup table to a character vector
Expand All @@ -42,7 +45,7 @@ harmonize_fct <- function(.data, ..., .spec = NULL, .lookup = NULL) {
#' @returns A character vector with values replaced according to the lookup
#' table.
#' @keywords internal
.apply_fct_lookup <- function(.data, .lookup = NULL) {
.harmonize_fct_by_lookup <- function(.data, .lookup = NULL) {
.lookup <- stbl::to_chr(.lookup)
matches <- .data %fin% names(.lookup)
.data[matches] <- .lookup[.data[matches]]
Expand Down
4 changes: 2 additions & 2 deletions R/specify_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#' )
#' @export
specify_df <- function(...) {
.check_args_named(...)
.check_args_spec(...)
.stop_if_args_unnamed(...)
.stop_if_args_not_spec(...)
structure(list(...), class = c("hrmn_spec_df", "hrmn_spec", "list"))
}
Loading