From 2c9568640e8a78ceae7b94fe37f5ed4a28ba9707 Mon Sep 17 00:00:00 2001 From: RobertZK Date: Thu, 14 Apr 2016 17:22:39 -0500 Subject: [PATCH 1/9] Additional fixes for legacy mungebits. --- CHANGELOG.md | 2 +- DESCRIPTION | 2 +- R/munge.R | 21 +++++++++++++++++++-- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc46891..a6c1c54 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -### Version 0.1.0.9004-6 +### Version 0.1.0.9004-7 * Provide backwards compatibility with [legacy mungebits](https://github.com/robertzk/mungebits) diff --git a/DESCRIPTION b/DESCRIPTION index 48369d2..e0197b8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -6,7 +6,7 @@ Description: A way of thinking about data preparation that online prediction so that both can be described by the same codebase. With mungebits, you can save time on having to re-implement your R code to work in production and instead re-use the same codebase. -Version: 0.1.0.9006 +Version: 0.1.0.9007 Author: Robert Krzyzanowski Maintainer: Robert Krzyzanowski Authors@R: c(person("Robert", "Krzyzanowski", diff --git a/R/munge.R b/R/munge.R index a7d43ef..473edcd 100644 --- a/R/munge.R +++ b/R/munge.R @@ -247,7 +247,9 @@ munge <- function(data, mungelist, stagerunner = FALSE, list = FALSE, parse = TRUE) { stopifnot(is.data.frame(data) || (is.environment(data) && - (!identical(stagerunner, FALSE) || any(ls(data) == "data")))) + ## We have to be slightly careful here to ensure that we handle + ## [objectdiff](https://github.com/robertzk/objectdiff) environments correctly. + (!identical(stagerunner, FALSE) || environment_has_data(data)))) if (length(mungelist) == 0L) { return(data) @@ -376,7 +378,9 @@ mungepiece_stage_body <- function() { ## the trained mungepiece. # Make a fresh copy to avoid shared stage problems. piece <- mungepieces[[mungepiece_index]]$duplicate(private = TRUE) - piece$run(env) + ## We don't do the straightforward `piece$run(env)` to ensure + ## compatibility with [tracked environments](https://github.com/robertzk/objectdiff). + env$data <- piece$run(env$data) newpieces[[mungepiece_index]] <<- piece ## When we are out of mungepieces, that is, when the current index equals @@ -386,6 +390,7 @@ mungepiece_stage_body <- function() { ## the munging actions on new data by passing the dataframe as the second ## argument to the `munge` function. if (mungepiece_index == size) { + names(newpieces) <- names(mungepieces) attr(env$data, "mungepieces") <- append(attr(env$data, "mungepieces"), newpieces) } @@ -418,9 +423,21 @@ legacy_mungepiece_stage_body <- function() { piece$run(env) if (mungepiece_index == size) { + names(newpieces) <- names(mungepieces) attr(env$data, "mungepieces") <- append(attr(env$data, "mungepieces"), newpieces) } }) } +environment_has_data <- function(env) { + ## For compatibility with [objectdiff](https://github.com/robertzk/objectdiff), + ## we use its special-purpose `ls`. + if (is(env, "tracked_environment") && + is.element("objectdiff", loadedNamespaces())) { + any(getFromNamespace("ls", "objectdiff")(data) == "data") + } else { + any(ls(data) == "data") + } +} + From 6ec05588f154f74ce95090b86d02ac2755289eea Mon Sep 17 00:00:00 2001 From: RobertZK Date: Sun, 17 Apr 2016 13:45:35 -0500 Subject: [PATCH 2/9] Check for legacy mungebits in parse_mungepiece. --- R/munge.R | 2 +- R/mungepiece.R | 2 +- R/parse_mungepiece.R | 6 +++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/R/munge.R b/R/munge.R index 473edcd..6efe513 100644 --- a/R/munge.R +++ b/R/munge.R @@ -437,7 +437,7 @@ environment_has_data <- function(env) { is.element("objectdiff", loadedNamespaces())) { any(getFromNamespace("ls", "objectdiff")(data) == "data") } else { - any(ls(data) == "data") + any(ls(env) == "data") } } diff --git a/R/mungepiece.R b/R/mungepiece.R index 787a618..eba6453 100644 --- a/R/mungepiece.R +++ b/R/mungepiece.R @@ -68,7 +68,7 @@ mungepiece <- R6::R6Class("mungepiece", duplicate_mungepiece <- function(piece, ...) { ## To ensure backwards compatibility with ## [legacy mungebits](https://github.com/robertzk/mungebits), - ## we perform nothing is the piece is not an R6 object (and hence + ## we perform nothing if the piece is not an R6 object (and hence ## a new mungepiece in the style of this package). if (is.legacy_mungepiece(piece)) { piece diff --git a/R/parse_mungepiece.R b/R/parse_mungepiece.R index 346cc39..9cad0b7 100644 --- a/R/parse_mungepiece.R +++ b/R/parse_mungepiece.R @@ -243,7 +243,11 @@ parse_mungepiece <- function(args) { } else if (length(args) == 1L && is.mungebit(args[[1L]])) { ## This case is technically handled already in parse_mungepiece_single, ## but we make it explicit here. - mungepiece$new(duplicate_mungebit(args[[1L]])) + if (is.legacy_mungebit(args[[1L]])) { + getFromNamespace("mungepiece", "mungebits")$new(args[[1L]]) + } else { + mungepiece$new(duplicate_mungebit(args[[1L]])) + } ## The third permissible format requires no unnamed arguments, since it ## must be a list consisting of a "train" and "predict" key. } else if (is.list(args) && length(args) > 0L) { From fc33b1b8e05c949341854be006f69870c9c1dfa7 Mon Sep 17 00:00:00 2001 From: RobertZK Date: Sun, 17 Apr 2016 14:17:48 -0500 Subject: [PATCH 3/9] Add failing test for third munge format using legacy mungebits. --- tests/testthat/test-legacy.R | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/testthat/test-legacy.R b/tests/testthat/test-legacy.R index 4e55a0c..89a4c30 100644 --- a/tests/testthat/test-legacy.R +++ b/tests/testthat/test-legacy.R @@ -80,6 +80,16 @@ describe("Creating legacy mungebits using the munge function", { attr(iris2, "mungepieces") <- NULL expect_equal(iris2, iris[-c(1,2)]) }) + + test_that("it should be able to create a legacy mungebit using the third munge format", { + legacy_fn <- function(df, ...) { + eval.parent(substitute({ df[[1]] <- NULL })) + } + class(legacy_fn) <- "legacy_mungebit_function" + iris2 <- munge(iris, list(list(train = list(legacy_fn, "foo"), predict = list(legacy_fn, "bar")))) + attr(iris2, "mungepieces") <- NULL + expect_equal(iris2, iris[-1L]) + }) }) From f6d39299d0c46a1930d9e43453f089d035f2c67f Mon Sep 17 00:00:00 2001 From: RobertZK Date: Sun, 17 Apr 2016 14:19:05 -0500 Subject: [PATCH 4/9] Add failing test for third munge format using legacy mungebits.s --- R/parse_mungepiece.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/parse_mungepiece.R b/R/parse_mungepiece.R index 9cad0b7..49e7c09 100644 --- a/R/parse_mungepiece.R +++ b/R/parse_mungepiece.R @@ -298,7 +298,7 @@ parse_mungepiece_dual <- function(args) { ## This is the format we need to use the `mungebit` and `mungepiece` ## constructors. - do.call(mungepiece$new, c(list(do.call(mungebit$new, args[[1L]])), args[[2L]])) + do.call(create_mungepiece, c(args[[1L]], args[[2L]])) } ## We perform [type dispatch](http://adv-r.had.co.nz/OO-essentials.html#s3) to From 49edd58af029d11412495e06e0d49095048a6165 Mon Sep 17 00:00:00 2001 From: RobertZK Date: Sun, 17 Apr 2016 14:20:14 -0500 Subject: [PATCH 5/9] Fix rocco doc to attribute create_mungepiece --- R/parse_mungepiece.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/parse_mungepiece.R b/R/parse_mungepiece.R index 49e7c09..54d497d 100644 --- a/R/parse_mungepiece.R +++ b/R/parse_mungepiece.R @@ -296,8 +296,9 @@ parse_mungepiece_dual <- function(args) { args <- Map(list, parse_mungepiece_dual_chunk(args$train, type = "train"), parse_mungepiece_dual_chunk(args$predict, type = "predict")) - ## This is the format we need to use the `mungebit` and `mungepiece` - ## constructors. + ## We use the `create_mungepiece` helper defined below to ensure this + ## construction works for new and [legacy](https://github.com/robertzk/mungebits) + ## mungepieces. do.call(create_mungepiece, c(args[[1L]], args[[2L]])) } From bfd016bef4c0782ebc557a68c161f92e50e52411 Mon Sep 17 00:00:00 2001 From: RobertZK Date: Wed, 27 Apr 2016 21:03:23 -0500 Subject: [PATCH 6/9] Test that passing an environment with a data key works correctly with the munge function. --- R/munge.R | 15 +++++++++++++-- tests/testthat/test-munge.R | 17 +++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/R/munge.R b/R/munge.R index 6efe513..1438b78 100644 --- a/R/munge.R +++ b/R/munge.R @@ -303,7 +303,7 @@ munge_ <- function(data, mungelist, stagerunner, list_output, parse) { ## by the `mungepiece_stages` helper. stages <- mungepiece_stages(mungelist) if (is.environment(data)) { - context <- data + context <- normalize_environment(data) } else { context <- list2env(list(data = data), parent = emptyenv()) } @@ -430,12 +430,23 @@ legacy_mungepiece_stage_body <- function() { }) } +normalize_environment <- function(env) { + ## For compatibility with [objectdiff](https://github.com/robertzk/objectdiff), + ## we use its special-purpose `ls`. + if (is(env, "tracked_environment") && + is.element("objectdiff", loadedNamespaces())) { + getFromNamespace("environment", "objectdiff")(env) + } else { + env + } +} + environment_has_data <- function(env) { ## For compatibility with [objectdiff](https://github.com/robertzk/objectdiff), ## we use its special-purpose `ls`. if (is(env, "tracked_environment") && is.element("objectdiff", loadedNamespaces())) { - any(getFromNamespace("ls", "objectdiff")(data) == "data") + any(getFromNamespace("ls", "objectdiff")(env) == "data") } else { any(ls(env) == "data") } diff --git a/tests/testthat/test-munge.R b/tests/testthat/test-munge.R index 3004590..e1b5a71 100644 --- a/tests/testthat/test-munge.R +++ b/tests/testthat/test-munge.R @@ -10,6 +10,22 @@ describe("Invalid inputs", { test_that("when munging against a data.frame it must have a mungepieces attribute", { expect_error(munge(iris, beaver2), "must have a ") }) + + test_that("when passing an environment it contains a data key", { + env <- list2env(list(foo = iris)) + expect_error(munge(env, identity), "is.data.frame") + env <- list2env(list(data = iris)) + munge(env, list(list(identity))) + }) + + test_that("when passing a tracked_environment it contains a data key", { + if (requireNamespace("objectdiff", quietly = TRUE)) { + env <- objectdiff::tracked_environment(list2env(list(foo = iris))) + expect_error(munge(env, identity), "is.data.frame") + env <- objectdiff::tracked_environment(list2env(list(data = iris))) + munge(env, list(list(identity))) + } + }) }) test_that("it does nothing when no mungepieces are passed", { @@ -182,3 +198,4 @@ describe("using mungepieces with inputs", { }) }) + From dc2dd93cb3a2feff7adf3cc713e77ef109e5bf7a Mon Sep 17 00:00:00 2001 From: RobertZK Date: Wed, 27 Apr 2016 21:19:59 -0500 Subject: [PATCH 7/9] Test that mungepiece names are preserved. --- R/parse_mungepiece.R | 4 +++- tests/testthat/test-munge.R | 12 ++++++++++++ tests/testthat/test-parse_mungepiece.R | 8 +++++++- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/R/parse_mungepiece.R b/R/parse_mungepiece.R index 54d497d..aab658b 100644 --- a/R/parse_mungepiece.R +++ b/R/parse_mungepiece.R @@ -235,7 +235,9 @@ #' # The munge function uses the attached "mungepieces" attribute, a list of #' # trained mungepieces. parse_mungepiece <- function(args) { - if (is.mungepiece(args) || is.mungebit(args)) { args <- list(args) } + if (is.mungepiece(args) || is.mungebit(args) || is.function(args)) { + args <- list(args) + } if (length(args) == 1L && is.mungepiece(args[[1L]])) { ## We duplicate the mungepiece to avoid training it. diff --git a/tests/testthat/test-munge.R b/tests/testthat/test-munge.R index e1b5a71..f850de2 100644 --- a/tests/testthat/test-munge.R +++ b/tests/testthat/test-munge.R @@ -95,6 +95,18 @@ describe("it can procure the mungepieces list", { }) }) +test_that("mungepiece names are preserved", { + iris2 <- munge(iris, list("Step 1" = list(identity), "Step 2" = list(identity))) + expect_equal(names(attr(iris2, "mungepieces")), c("Step 1", "Step 2")) +}) + +test_that("mungepiece names are preserved for legacy mungebits", { + legacy_function <- function(x) { x } + class(legacy_function) <- c("legacy_mungebit_function", class(legacy_function)) + iris2 <- munge(iris, list("Step 1" = list(legacy_function), "Step 2" = list(legacy_function))) + expect_equal(names(attr(iris2, "mungepieces")), c("Step 1", "Step 2")) +}) + describe("using mungepieces with inputs", { simple_imputer <- function(...) { diff --git a/tests/testthat/test-parse_mungepiece.R b/tests/testthat/test-parse_mungepiece.R index dc1cb60..92ff871 100644 --- a/tests/testthat/test-parse_mungepiece.R +++ b/tests/testthat/test-parse_mungepiece.R @@ -3,7 +3,7 @@ context("parse_mungepiece") describe("Invalid inputs", { test_that("it breaks when it does not receive a list", { expect_error(parse_mungepiece(5)) - expect_error(parse_mungepiece(identity)) + expect_error(parse_mungepiece(NULL)) expect_error(parse_mungepiece(iris)) }) @@ -37,6 +37,12 @@ train_fn <- function(data, by = 2) { predict_fn <- function(data, ...) { data[[1]] <- input$by * data[[1]]; data } +test_that("it can receive a simple function", { + piece <- parse_mungepiece(identity) + piece2 <- mungepiece$new(mungebit$new(identity)) + expect_same_piece(piece, piece2) +}) + describe("First format", { test_that("it correctly creates a mungepiece using the first format with no additional arguments", { piece <- parse_mungepiece(list(train_fn, 2)) From 447451c9def32d800a846fc102565c1bb62d85d7 Mon Sep 17 00:00:00 2001 From: RobertZK Date: Wed, 27 Apr 2016 21:25:22 -0500 Subject: [PATCH 8/9] Ensure parsing a single legacy mungebit works. --- tests/testthat/test-parse_mungepiece.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/testthat/test-parse_mungepiece.R b/tests/testthat/test-parse_mungepiece.R index 92ff871..2dc940f 100644 --- a/tests/testthat/test-parse_mungepiece.R +++ b/tests/testthat/test-parse_mungepiece.R @@ -43,6 +43,13 @@ test_that("it can receive a simple function", { expect_same_piece(piece, piece2) }) +test_that("it can parse a pre-existing legacy mungebit", { + if ("mungebits" %in% row.names(installed.packages())) { + legacy_mungebit <- mungebits:::mungebit$new(function(x) { x }) + parse_mungepiece(list(legacy_mungebit)) + } +}) + describe("First format", { test_that("it correctly creates a mungepiece using the first format with no additional arguments", { piece <- parse_mungepiece(list(train_fn, 2)) From 835a3d0011b83981cdd2a1c26dd3731a9293ba12 Mon Sep 17 00:00:00 2001 From: RobertZK Date: Wed, 27 Apr 2016 22:00:05 -0500 Subject: [PATCH 9/9] check that munging works with a stagerunner generated for an objectdiff tracked environment --- R/munge.R | 6 +++++- tests/testthat/test-munge.R | 11 +++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/R/munge.R b/R/munge.R index 1438b78..14d7fd9 100644 --- a/R/munge.R +++ b/R/munge.R @@ -303,7 +303,11 @@ munge_ <- function(data, mungelist, stagerunner, list_output, parse) { ## by the `mungepiece_stages` helper. stages <- mungepiece_stages(mungelist) if (is.environment(data)) { - context <- normalize_environment(data) + if (identical(stagerunner, FALSE)) { + context <- normalize_environment(data) + } else { + context <- data + } } else { context <- list2env(list(data = data), parent = emptyenv()) } diff --git a/tests/testthat/test-munge.R b/tests/testthat/test-munge.R index f850de2..f4bf1e4 100644 --- a/tests/testthat/test-munge.R +++ b/tests/testthat/test-munge.R @@ -107,6 +107,17 @@ test_that("mungepiece names are preserved for legacy mungebits", { expect_equal(names(attr(iris2, "mungepieces")), c("Step 1", "Step 2")) }) +test_that("munging works with a stagerunner generated for an objectdiff tracked environment", { + if (requireNamespace("objectdiff", quietly = TRUE)) { + env <- objectdiff::tracked_environment(list2env(list(data = iris))) + runner <- munge(env, list("Step 1" = list(identity)), stagerunner = list(remember = TRUE)) + runner$run(1) + result <- runner$context$data + attr(result, "mungepieces") <- NULL + expect_equal(result, iris) + } +}) + describe("using mungepieces with inputs", { simple_imputer <- function(...) {