diff --git a/DESCRIPTION b/DESCRIPTION index 08cdde2..083fc22 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: datawrap Title: Final Steps for Dataset Preparation -Version: 0.0.0.9000 +Version: 0.0.0.9001 Authors@R: person("Jon", "Harmon", , "jonthegeek@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-4781-4346")) @@ -10,10 +10,13 @@ License: MIT + file LICENSE URL: https://wranglezone.github.io/datawrap/, https://github.com/wranglezone/datawrap BugReports: https://github.com/wranglezone/datawrap/issues +Depends: + R (>= 4.1) Imports: knitr, purrr, rlang, + stbl, stringr, tibble, vctrs @@ -21,6 +24,8 @@ Suggests: testthat (>= 3.0.0), usethis, withr +Remotes: + stbl=wranglezone/stbl Config/testthat/edition: 3 Encoding: UTF-8 Language: en-US diff --git a/NAMESPACE b/NAMESPACE index 385e453..3437290 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,4 +2,5 @@ export(create_dataset_dictionary) export(describe_dataset) +export(finalize_integers) export(write_dataset_dictionary) diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..729911f --- /dev/null +++ b/NEWS.md @@ -0,0 +1,7 @@ +# datawrap (development version) + +* `finalize_integers()` converts integerish columns to integer for efficient storage (@copilot & @jonthegeek, #10). + +# datawrap 0.0.0.9000 + +* Initial release. diff --git a/R/finalize_integers.R b/R/finalize_integers.R new file mode 100644 index 0000000..0d2940c --- /dev/null +++ b/R/finalize_integers.R @@ -0,0 +1,33 @@ +#' Downcast integerish columns to integer +#' +#' Iterates over all columns (or list elements) in `dataset` and converts any +#' non-logical column whose values can all be represented as integers without +#' losing any information to integers. Columns that contain non-integerish data +#' are left unchanged. +#' +#' @param dataset (`data.frame`, `list`, or `NULL`) The dataset to process. 
+#'
+#' @returns The `dataset` with all integerish columns converted to integer (or
+#' `NULL` if `dataset` is `NULL`).
+#' @export
+#'
+#' @examples
+#' df <- data.frame(x = c(1.0, 2.0, 3.0), y = c(1.1, 2.2, 3.3))
+#' finalize_integers(df)
+finalize_integers <- function(dataset) {
+  is_supported <- is.null(dataset) || is.list(dataset)
+  if (!is_supported) {
+    stbl::pkg_abort(
+      "datawrap",
+      "{.arg dataset} must be a {.cls data.frame}, {.cls list}, or {.cls NULL}.",
+      c("invalid_dataset", "invalid_argument")
+    )
+  }
+  # Mark each element that is not logical and that stbl considers int-ish.
+  # NOTE: dplyr::mutate() + across() + where() could replace this, but this
+  # function alone does not justify adding dplyr to Imports.
+  is_downcastable <- \(column) !is.logical(column) && stbl::is_int_ish(column)
+  downcast <- purrr::map_lgl(dataset, is_downcastable)
+  dataset[downcast] <- purrr::map(dataset[downcast], stbl::to_int)
+  dataset
+}
diff --git a/man/datawrap-package.Rd b/man/datawrap-package.Rd
index 82d7860..533de11 100644
--- a/man/datawrap-package.Rd
+++ b/man/datawrap-package.Rd
@@ -11,8 +11,8 @@ Helpers to create data dictionaries, document package datasets, and apply finish
 \seealso{
 Useful links:
 \itemize{
-  \item \url{https://github.com/wranglezone/datawrap}
   \item \url{https://wranglezone.github.io/datawrap/}
+  \item \url{https://github.com/wranglezone/datawrap}
   \item Report bugs at \url{https://github.com/wranglezone/datawrap/issues}
 }
 
diff --git a/man/finalize_integers.Rd b/man/finalize_integers.Rd
new file mode 100644
index 0000000..c34f33a
--- /dev/null
+++ b/man/finalize_integers.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/finalize_integers.R
+\name{finalize_integers}
+\alias{finalize_integers}
+\title{Downcast integerish columns to integer}
+\usage{
+finalize_integers(dataset)
+}
+\arguments{
+\item{dataset}{(\code{data.frame}, \code{list}, or \code{NULL}) The dataset to process.}
+}
+\value{
+The \code{dataset} with all integerish columns
converted to integer (or
+\code{NULL} if \code{dataset} is \code{NULL}).
+}
+\description{
+Iterates over all columns (or list elements) in \code{dataset} and converts any
+non-logical column whose values can all be represented as integers without
+losing any information to integers. Columns that contain non-integerish data
+are left unchanged.
+}
+\examples{
+df <- data.frame(x = c(1.0, 2.0, 3.0), y = c(1.1, 2.2, 3.3))
+finalize_integers(df)
+}
diff --git a/tests/testthat/test-finalize_integers.R b/tests/testthat/test-finalize_integers.R
new file mode 100644
index 0000000..2f4e4ad
--- /dev/null
+++ b/tests/testthat/test-finalize_integers.R
@@ -0,0 +1,68 @@
+test_that("finalize_integers() converts integerish double columns to integer (#10)", {
+  input <- data.frame(x = c(1.0, 2.0, 3.0), y = c(1.1, 2.2, 3.3))
+  out <- finalize_integers(input)
+  expect_type(out[["x"]], "integer")
+  expect_type(out[["y"]], "double")
+})
+
+test_that("finalize_integers() converts integer-valued double elements in a list to integer (#10)", {
+  input <- list(x = c(1.0, 2.0, 3.0), y = c(1.1, 2.2, 3.3))
+  out <- finalize_integers(input)
+  expect_type(out[["x"]], "integer")
+  expect_type(out[["y"]], "double")
+})
+
+test_that("finalize_integers() converts integerish character columns to integer (#10)", {
+  input <- data.frame(x = as.character(1:3), y = as.character(c(1.1, 2.2, 3.3)))
+  out <- finalize_integers(input)
+  expect_type(out[["x"]], "integer")
+  expect_type(out[["y"]], "character")
+})
+
+test_that("finalize_integers() leaves non-integerish columns unchanged (#10)", {
+  input <- data.frame(
+    x = c(1.0, 2.0),
+    y = letters[1:2],
+    z = TRUE,
+    stringsAsFactors = FALSE
+  )
+  out <- finalize_integers(input)
+  expect_type(out[["y"]], "character")
+  expect_type(out[["z"]], "logical")
+})
+
+test_that("finalize_integers() preserves NA values in converted columns (#10)", {
+  input <- data.frame(x = c(1.0, NA, 3.0))
+  out <- finalize_integers(input)
+  expect_type(out[["x"]], "integer")
+  expect_true(is.na(out[["x"]][[2]]))
+})
+
+test_that("finalize_integers() preserves the class of the input (#10)", {
+  input <- tibble::tibble(x = c(1.0, 2.0))
+  out <- finalize_integers(input)
+  expect_identical(class(out), class(input))
+})
+
+test_that("finalize_integers() errors if dataset is not a data.frame, list, or NULL (#10)", {
+  stbl::expect_pkg_error_classes(
+    finalize_integers("not a dataset"),
+    "datawrap",
+    "invalid_dataset",
+    "invalid_argument"
+  )
+})
+
+test_that("finalize_integers() handles a dataset with no double columns (#10)", {
+  input <- data.frame(x = 1L, y = "a", stringsAsFactors = FALSE)
+  expect_identical(finalize_integers(input), input)
+})
+
+test_that("finalize_integers() handles an empty data frame (#10)", {
+  input <- data.frame()
+  expect_identical(finalize_integers(input), input)
+})
+
+test_that("finalize_integers() handles NULL (#10)", {
+  expect_null(finalize_integers(NULL))
+})