From 4678d30e805f412e42f94fe322c862358e72c1e3 Mon Sep 17 00:00:00 2001 From: Jon Harmon Date: Sun, 5 Oct 2025 10:31:39 -0500 Subject: [PATCH 1/3] Implement `harmonize_fct()` Closes #3. --- DESCRIPTION | 6 +- NAMESPACE | 3 +- R/aaa-shared_params.R | 9 +++ R/harmonize_fct.R | 50 ++++++++++++++++ R/hrmn-package.R | 1 + R/specify_fct.R | 47 +++++---------- R/zzz.R | 3 - man/dot-apply_fct_lookup.Rd | 23 ++++++++ man/dot-shared_params.Rd | 12 ++++ man/harmonize_fct.Rd | 43 ++++++++++++++ man/specify_fct.Rd | 13 +++-- tests/testthat/test-harmonize_fct.R | 90 +++++++++++++++++++++++++++++ tests/testthat/test-specify_fct.R | 7 +-- 13 files changed, 261 insertions(+), 46 deletions(-) create mode 100644 R/aaa-shared_params.R create mode 100644 R/harmonize_fct.R delete mode 100644 R/zzz.R create mode 100644 man/dot-apply_fct_lookup.Rd create mode 100644 man/dot-shared_params.Rd create mode 100644 man/harmonize_fct.Rd create mode 100644 tests/testthat/test-harmonize_fct.R diff --git a/DESCRIPTION b/DESCRIPTION index a9af153..929c13a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: hrmn Title: Harmonize Datasets -Version: 0.0.0.9001 +Version: 0.0.0.9002 Authors@R: person("Jon", "Harmon", , "jonthegeek@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-4781-4346")) @@ -22,4 +22,6 @@ Language: en-US Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.3 Imports: - S7 + fastmatch, + rlang, + stbl diff --git a/NAMESPACE b/NAMESPACE index 9ebe023..78f3dd9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,4 +1,5 @@ # Generated by roxygen2: do not edit by hand +export(harmonize_fct) export(specify_fct) -if (getRversion() < "4.3.0") importFrom("S7", "@") +importFrom(fastmatch,"%fin%") diff --git a/R/aaa-shared_params.R b/R/aaa-shared_params.R new file mode 100644 index 0000000..eabf2a2 --- /dev/null +++ b/R/aaa-shared_params.R @@ -0,0 +1,9 @@ +#' Parameters used in multiple functions +#' +#' Reused parameter definitions are gathered here for easier editing. +#' +#' @param levels (`character`) The allowed values of the factor. +#' +#' @name .shared_params +#' @keywords internal +NULL diff --git a/R/harmonize_fct.R b/R/harmonize_fct.R new file mode 100644 index 0000000..252e8d0 --- /dev/null +++ b/R/harmonize_fct.R @@ -0,0 +1,50 @@ +#' Harmonize a factor +#' +#' @param .data (`character` or coercible to `character`) A vector to harmonize +#' to the specified factor. +#' @inheritParams .shared_params +#' @inheritParams rlang::args_dots_empty +#' @param .spec (`hrmn_spec_fct`) A harmonization specification from +#' [specify_fct()]. +#' @param .lookup (named `character`) A vector of replacement values. The names +#' are the values in `.data` and the values are the target values. +#' +#' @returns A harmonized [factor()]. +#' @export +#' +#' @examples +#' # Without a spec, harmonize_fct() acts like [base::factor()]. +#' harmonize_fct(c("a", "b", "c")) +#' +#' # Basic harmonization, dropping levels not in the spec +#' spec <- specify_fct(levels = c("a", "b")) +#' harmonize_fct(c("a", "b", "c"), .spec = spec) +#' +#' # Using a lookup table to recode values +#' spec2 <- specify_fct(levels = c("fruit", "citrus")) +#' lookup <- c(apple = "fruit", banana = "fruit", orange = "citrus") +#' harmonize_fct( +#' c("apple", "banana", "orange"), +#' .spec = spec2, +#' .lookup = lookup +#' ) +harmonize_fct <- function(.data, ..., .spec = NULL, .lookup = NULL) { + rlang::check_dots_empty() + .data <- stbl::to_chr(.data) + .spec <- .spec %||% specify_fct() + .data <- .apply_fct_lookup(.data, .lookup = .lookup) + return(factor(.data, levels = .spec$levels)) +} + +#' Apply a lookup table to a character vector +#' +#' @inheritParams harmonize_fct +#' @returns A character vector with values replaced according to the lookup +#' table. +#' @keywords internal +.apply_fct_lookup <- function(.data, .lookup = NULL) { + .lookup <- stbl::to_chr(.lookup) + matches <- .data %fin% names(.lookup) + .data[matches] <- .lookup[.data[matches]] + return(.data) +} diff --git a/R/hrmn-package.R b/R/hrmn-package.R index a65cf64..0fff8a5 100644 --- a/R/hrmn-package.R +++ b/R/hrmn-package.R @@ -2,5 +2,6 @@ "_PACKAGE" ## usethis namespace: start +#' @importFrom fastmatch %fin% ## usethis namespace: end NULL diff --git a/R/specify_fct.R b/R/specify_fct.R index 02562ad..293aa14 100644 --- a/R/specify_fct.R +++ b/R/specify_fct.R @@ -1,34 +1,19 @@ -# Developer note: `specify_fct()` is designed to create a data-less -# "specification" object. It defines the target state (the levels) for a factor -# but doesn't hold any actual factor data itself. This is why the constructor -# internally provides `integer()` as the data component to `S7::new_object()`. -# -# In the future, we might use the `hrmn_fct` class to represent actual, -# harmonized factor data. In that scenario, we would likely create a separate -# `class_hrmn_fct` object and have `specify_fct()` be a wrapper function that -# calls the constructor with the empty data. For now, since we don't need the -# full factor-like class, we are directly defining the `hrmn_fct` class in -# `specify_fct()` - -#' Specify a factor harmonization +#' Factor specification #' -#' Create a `hrmn_fct` object that specifies the desired levels for a factor -#' variable. This 'specification' object does not contain any data itself, only -#' the rules for harmonization. +#' Create an object that specifies the desired levels for a factor variable. +#' This specification object does not contain any data itself, only the rules +#' for harmonization. #' -#' @param levels (`character`) The allowed values of the factor. -#' @returns A factor specification, an S7 object of class `hrmn::hrmn_fct`. +#' @inheritParams .shared_params +#' +#' @returns A `hrmn_fct_spec` object that acts as a specification. #' @export -specify_fct <- S7::new_class( - "hrmn_fct", - parent = S7::class_factor, - properties = list( - levels = S7::class_character - ), - constructor = function(levels = character()) { - S7::new_object( - integer(), - levels = levels - ) - } -) +#' +#' @examples +#' specify_fct(levels = c("a", "b", "c")) +specify_fct <- function(levels = character()) { + structure( + list(levels = stbl::to_chr(levels)), + class = c("hrmn_fct_spec", "hrmn_spec", "list") + ) +} diff --git a/R/zzz.R b/R/zzz.R deleted file mode 100644 index 2b31dbb..0000000 --- a/R/zzz.R +++ /dev/null @@ -1,3 +0,0 @@ -# enable usage of @name in package code -#' @rawNamespace if (getRversion() < "4.3.0") importFrom("S7", "@") -NULL diff --git a/man/dot-apply_fct_lookup.Rd b/man/dot-apply_fct_lookup.Rd new file mode 100644 index 0000000..e2dfb44 --- /dev/null +++ b/man/dot-apply_fct_lookup.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/harmonize_fct.R +\name{.apply_fct_lookup} +\alias{.apply_fct_lookup} +\title{Apply a lookup table to a character vector} +\usage{ +.apply_fct_lookup(.data, .lookup = NULL) +} +\arguments{ +\item{.data}{(\code{character} or coercible to \code{character}) A vector to harmonize +to the specified factor.} + +\item{.lookup}{(named \code{character}) A vector of replacement values. The names +are the values in \code{.data} and the values are the target values.} +} +\value{ +A character vector with values replaced according to the lookup +table. +} +\description{ +Apply a lookup table to a character vector +} +\keyword{internal} diff --git a/man/dot-shared_params.Rd b/man/dot-shared_params.Rd new file mode 100644 index 0000000..8eb0f4b --- /dev/null +++ b/man/dot-shared_params.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/aaa-shared_params.R +\name{.shared_params} +\alias{.shared_params} +\title{Parameters used in multiple functions} +\arguments{ +\item{levels}{(\code{character}) The allowed values of the factor.} +} +\description{ +Reused parameter definitions are gathered here for easier editing. +} +\keyword{internal} diff --git a/man/harmonize_fct.Rd b/man/harmonize_fct.Rd new file mode 100644 index 0000000..a5f8c13 --- /dev/null +++ b/man/harmonize_fct.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/harmonize_fct.R +\name{harmonize_fct} +\alias{harmonize_fct} +\title{Harmonize a factor} +\usage{ +harmonize_fct(.data, ..., .spec = NULL, .lookup = NULL) +} +\arguments{ +\item{.data}{(\code{character} or coercible to \code{character}) A vector to harmonize +to the specified factor.} + +\item{...}{These dots are for future extensions and must be empty.} + +\item{.spec}{(\code{hrmn_spec_fct}) A harmonization specification from +\code{\link[=specify_fct]{specify_fct()}}.} + +\item{.lookup}{(named \code{character}) A vector of replacement values. The names +are the values in \code{.data} and the values are the target values.} +} +\value{ +A harmonized \code{\link[=factor]{factor()}}. +} +\description{ +Harmonize a factor +} +\examples{ +# Without a spec, harmonize_fct() acts like [base::factor()]. +harmonize_fct(c("a", "b", "c")) + +# Basic harmonization, dropping levels not in the spec +spec <- specify_fct(levels = c("a", "b")) +harmonize_fct(c("a", "b", "c"), .spec = spec) + +# Using a lookup table to recode values +spec2 <- specify_fct(levels = c("fruit", "citrus")) +lookup <- c(apple = "fruit", banana = "fruit", orange = "citrus") +harmonize_fct( + c("apple", "banana", "orange"), + .spec = spec2, + .lookup = lookup +) +} diff --git a/man/specify_fct.Rd b/man/specify_fct.Rd index f460874..ff3a34e 100644 --- a/man/specify_fct.Rd +++ b/man/specify_fct.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/specify_fct.R \name{specify_fct} \alias{specify_fct} -\title{Specify a factor harmonization} +\title{Factor specification} \usage{ specify_fct(levels = character()) } @@ -10,10 +10,13 @@ specify_fct(levels = character()) \item{levels}{(\code{character}) The allowed values of the factor.} } \value{ -A factor specification, an S7 object of class \code{hrmn::hrmn_fct}. +A \code{hrmn_fct_spec} object that acts as a specification. } \description{ -Create a \code{hrmn_fct} object that specifies the desired levels for a factor -variable. This 'specification' object does not contain any data itself, only -the rules for harmonization. +Create an object that specifies the desired levels for a factor variable. +This specification object does not contain any data itself, only the rules +for harmonization. +} +\examples{ +specify_fct(levels = c("a", "b", "c")) } diff --git a/tests/testthat/test-harmonize_fct.R b/tests/testthat/test-harmonize_fct.R new file mode 100644 index 0000000..a99642c --- /dev/null +++ b/tests/testthat/test-harmonize_fct.R @@ -0,0 +1,90 @@ +test_that("harmonize_fct() works with empty vector and returns a factor", { + expect_identical( + { + harmonize_fct(factor()) + }, + factor() + ) +}) + +test_that("harmonize_fct() drops unspecified levels", { + expect_identical( + { + harmonize_fct(factor(c("a", "b")), .spec = specify_fct(levels = "a")) + }, + factor(c("a", NA), levels = "a") + ) +}) + + +test_that("harmonize_fct() errors if .spec is not named", { + expect_error( + { + harmonize_fct(factor(c("a", "b")), specify_fct(levels = "a")) + }, + class = "rlib_error_dots_nonempty" + ) +}) + +test_that("The first `harmonize_fct()` argument is `.data`", { + expect_equal( + rlang::fn_fmls_names(harmonize_fct)[1], + ".data" + ) +}) + +test_that("harmonize_fct() preserves existing NAs", { + expect_equal( + { + harmonize_fct(factor(c("a", "b", NA)), .spec = specify_fct(levels = "a")) + }, + factor(c("a", NA, NA), levels = "a") + ) +}) + +test_that("harmonize_fct() works with character vectors", { + expect_equal( + { + harmonize_fct(c("a", "b"), .spec = specify_fct(levels = "a")) + }, + factor(c("a", NA), levels = "a") + ) +}) + +test_that("harmonize_fct() works with an empty spec", { + expect_equal( + { + harmonize_fct( + factor(c("a", "b")), + .spec = specify_fct(levels = character()) + ) + }, + factor(c(NA, NA), levels = character()) + ) +}) + +test_that("harmonize_fct() uses .lookup table", { + expect_equal( + { + harmonize_fct( + c("x", "y", "z"), + .spec = specify_fct(levels = c("a", "b")), + .lookup = c(x = "a", y = "a", z = "b") + ) + }, + factor(c("a", "a", "b"), levels = c("a", "b")) + ) +}) + +test_that("harmonize_fct() .lookup values not in levels become NA", { + expect_equal( + { + harmonize_fct( + "x", + .spec = specify_fct(levels = "a"), + .lookup = c(x = "b") + ) + }, + factor(NA_character_, levels = "a") + ) +}) diff --git a/tests/testthat/test-specify_fct.R b/tests/testthat/test-specify_fct.R index f966f88..9b93283 100644 --- a/tests/testthat/test-specify_fct.R +++ b/tests/testthat/test-specify_fct.R @@ -1,8 +1,7 @@ test_that("specify_fct() returns an object with the correct class", { - spec <- specify_fct() expect_s3_class( - spec, - c("hrmn::hrmn_fct", "factor", "S7_object"), + specify_fct(), + c("hrmn_fct_spec", "hrmn_spec", "list"), exact = TRUE ) }) @@ -10,5 +9,5 @@ test_that("specify_fct() returns an object with the correct class", { test_that("specify_fct() stores the levels", { lvls <- c("a", "b", "c") spec <- specify_fct(levels = lvls) - expect_equal(spec@levels, lvls) + expect_equal(spec$levels, lvls) }) From d4a8f96ca5b7643aa946bee015c117206af4f7f5 Mon Sep 17 00:00:00 2001 From: Jon Harmon Date: Mon, 6 Oct 2025 08:23:44 -0500 Subject: [PATCH 2/3] Let rlang deal with `%||%` --- NAMESPACE | 1 + R/hrmn-package.R | 1 + 2 files changed, 2 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 78f3dd9..04627fa 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,3 +3,4 @@ export(harmonize_fct) export(specify_fct) importFrom(fastmatch,"%fin%") +importFrom(rlang,"%||%") diff --git a/R/hrmn-package.R b/R/hrmn-package.R index 0fff8a5..860748b 100644 --- a/R/hrmn-package.R +++ b/R/hrmn-package.R @@ -3,5 +3,6 @@ ## usethis namespace: start #' @importFrom fastmatch %fin% +#' @importFrom rlang %||% ## usethis namespace: end NULL From 2a375ac110f31a77e86218145d25e69339ea7468 Mon Sep 17 00:00:00 2001 From: Jon Harmon Date: Mon, 6 Oct 2025 08:25:17 -0500 Subject: [PATCH 3/3] Typo --- R/harmonize_fct.R | 2 +- man/harmonize_fct.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/harmonize_fct.R b/R/harmonize_fct.R index 252e8d0..9e1f9c7 100644 --- a/R/harmonize_fct.R +++ b/R/harmonize_fct.R @@ -4,7 +4,7 @@ #' to the specified factor. #' @inheritParams .shared_params #' @inheritParams rlang::args_dots_empty -#' @param .spec (`hrmn_spec_fct`) A harmonization specification from +#' @param .spec (`hrmn_fct_spec`) A harmonization specification from #' [specify_fct()]. #' @param .lookup (named `character`) A vector of replacement values. The names #' are the values in `.data` and the values are the target values. diff --git a/man/harmonize_fct.Rd b/man/harmonize_fct.Rd index a5f8c13..bb55d28 100644 --- a/man/harmonize_fct.Rd +++ b/man/harmonize_fct.Rd @@ -12,7 +12,7 @@ to the specified factor.} \item{...}{These dots are for future extensions and must be empty.} -\item{.spec}{(\code{hrmn_spec_fct}) A harmonization specification from +\item{.spec}{(\code{hrmn_fct_spec}) A harmonization specification from \code{\link[=specify_fct]{specify_fct()}}.} \item{.lookup}{(named \code{character}) A vector of replacement values. The names