diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 6a4375b1..6a4189a8 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -61,6 +61,7 @@ jobs: matrix: label: - ext/differentiationinterface + - ext/mooncake version: - '1' - 'min' diff --git a/AGENTS.md b/AGENTS.md index 139496fe..8a03e58a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -32,6 +32,8 @@ AbstractPPL.jl provides shared interfaces and utilities for probabilistic progra - Full package tests: `julia --project=. -e 'using Pkg; Pkg.test()'` - Docs: `julia --project=docs docs/make.jl` +Always refresh each environment (`Pkg.update()` / `up`) before tests or doc builds — a stale manifest can cause subtle resolution and loading issues. + Run the smallest relevant test first, then broaden when changing public interfaces, extensions, or downstream-facing behaviour. Do not weaken tests just to make CI pass. ## Documentation diff --git a/HISTORY.md b/HISTORY.md index 8d8e0e49..4cc01a81 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,3 +1,19 @@ +## 0.15.0 + +New evaluator-preparation and AD interface: `prepare` binds a callable to a sample input (vector or `NamedTuple`); `value_and_gradient!!` / `value_and_jacobian!!` return value-and-derivative pairs from the resulting `Prepared` wrapper. The `!!` suffix signals the returned derivative may alias the cache — copy if you need to keep it. + +```julia +using ADTypes, Mooncake # or DifferentiationInterface + ForwardDiff +using AbstractPPL: prepare, value_and_gradient!! +prepared = prepare(AutoMooncake(), x -> -0.5 * sum(abs2, x), zeros(3)) +val, grad = value_and_gradient!!(prepared, [1.0, 2.0, 3.0]) +# val == -7.0; grad == [-1.0, -2.0, -3.0] +``` + +Two new AD-backend extensions ship with it: `AbstractPPLDifferentiationInterfaceExt` (any DI backend) and `AbstractPPLMooncakeExt` (`AutoMooncake`, `AutoMooncakeForward`). 
`AbstractPPLTestExt` gains a conformance harness via `generate_testcases` / `run_testcases` (reserved groups: `:vector`, `:namedtuple`, `:edge`, `:cache_reuse`). + +See [`docs/src/evaluators.md`](docs/src/evaluators.md) for the full interface, the `check_dims` and `context::Tuple` options, the `NamedTuple` input path, and extension-author guidance. + ## 0.14.2 Fix string serialisation of VarNames such that the order of keyword arguments is preserved (this was previously guaranteed, but JSON.jl v1.5.0 introduced a change that caused the keyword arguments to always be sorted.) diff --git a/Project.toml b/Project.toml index e767cf77..0e7ce54d 100644 --- a/Project.toml +++ b/Project.toml @@ -3,7 +3,7 @@ uuid = "7a57a42e-76ec-4ea3-a279-07e840d6d9cf" keywords = ["probabilistic programming"] license = "MIT" desc = "Common interfaces for probabilistic programming" -version = "0.14.3" +version = "0.15" [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" @@ -21,11 +21,13 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [weakdeps] DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [extensions] AbstractPPLDifferentiationInterfaceExt = ["DifferentiationInterface"] AbstractPPLDistributionsExt = ["Distributions", "LinearAlgebra"] +AbstractPPLMooncakeExt = ["Mooncake"] AbstractPPLTestExt = ["Test"] [compat] @@ -39,6 +41,7 @@ Distributions = "0.25" JSON = "0.19 - 0.21, 1" LinearAlgebra = "<0.0.1, 1" MacroTools = "0.5" +Mooncake = "0.5.27" OrderedCollections = "1.8.1" Random = "1.6" StatsBase = "0.32, 0.33, 0.34" diff --git a/docs/src/evaluators.md b/docs/src/evaluators.md index 4f1e1512..9dd6f8af 100644 --- a/docs/src/evaluators.md +++ b/docs/src/evaluators.md @@ -138,6 +138,23 @@ library invokes the inner callable many times with same-length dual arrays derived from a single user-supplied `x`; re-validating 
on each invocation would be redundant work in the hot path. +## Constant context arguments + +When the underlying callable naturally takes the form `f(x, context...)` — +where everything after `x` is constant state — pass `context` as a tuple to +the vector form of `prepare`. AD differentiates only w.r.t. `x`; every +value in `context` is treated as inactive: + +```julia +affine(x, scale, offset) = scale * sum(x) + offset +prepared = prepare(adtype, affine, zeros(3); context=(2.0, 1.0)) +val, grad = value_and_gradient!!(prepared, [1.0, 2.0, 3.0]) +# val == 2.0 * 6.0 + 1.0; grad == [2.0, 2.0, 2.0] +``` + +`prepared(x)` evaluates `f(x, context...)`, and `context=()` (the default) +preserves the unary `f(x)` shape. + ## Without an AD backend The two-argument form `prepare(problem, x)` is available without any AD diff --git a/ext/AbstractPPLDifferentiationInterfaceExt.jl b/ext/AbstractPPLDifferentiationInterfaceExt.jl index 80f95ee1..1f4cfe8b 100644 --- a/ext/AbstractPPLDifferentiationInterfaceExt.jl +++ b/ext/AbstractPPLDifferentiationInterfaceExt.jl @@ -1,100 +1,131 @@ module AbstractPPLDifferentiationInterfaceExt using AbstractPPL: AbstractPPL -using AbstractPPL.Evaluators: Evaluators, Prepared, VectorEvaluator +using AbstractPPL.Evaluators: Evaluators, Prepared, VectorEvaluator, _ad_output_arity using ADTypes: AbstractADType, AutoReverseDiff using DifferentiationInterface: DifferentiationInterface as DI -# Differentiate only `x`; the evaluator is passed as a `DI.Constant` context so -# that in DynamicPPL the model and other evaluator state stay constant. -@inline _call_evaluator(x, evaluator) = evaluator(x) +# AD target used by both `DICache` modes. `Vararg{Any,N}` with a free `N` +# forces specialization on the trailing arity (a bare `Vararg{Any}` would +# skip it). DI invokes this as `_call_evaluator(x, f, c1, …, cN)` on the +# constants path, and as `_call_evaluator(x, evaluator)` (via `Fix2`) on +# the closure path — empty `ctx` then makes the splat a no-op. 
+@inline _call_evaluator(x, f::F, ctx::Vararg{Any,N}) where {F,N} = f(x, ctx...) -struct DICache{F,GP,JP} +# `Mode` tags the cache shape: +# * `:closure` — compiled-tape ReverseDiff: target is a `Fix2` closure, +# the AD call passes **0** `DI.Constant`s. +# * `N::Int` — constants path: `N == length(evaluator.context)`, the +# AD call passes **N + 1** `DI.Constant`s (`f` plus the +# `N` context values). +# Encoding `Mode` in the type resolves the dispatch in `_di_value_and_*` +# at compile time without a runtime branch. +struct DICache{Mode,F,GP,JP} target::F gradient_prep::GP jacobian_prep::JP - use_context::Bool + function DICache{Mode}(target::F, gp::GP, jp::JP) where {Mode,F,GP,JP} + return new{Mode,F,GP,JP}(target, gp, jp) + end end # Compiled ReverseDiff only reuses a compiled tape on the one-argument path; # `DI.Constant` deactivates tape recording, so close the evaluator into the -# target and call DI without contexts. +# target and call DI without constants. Context (if any) is captured inside +# the evaluator closure rather than lowered out — the lowered path would also +# require a closure here, so the wrapper cost is unavoidable for compiled tapes. function _prepare_di(prep::F, adtype::AutoReverseDiff{true}, x, evaluator) where {F} target = Base.Fix2(_call_evaluator, evaluator) - return target, prep(target, adtype, x), false + return target, prep(target, adtype, x), Val(:closure) end function _prepare_di(prep::F, adtype::AbstractADType, x, evaluator) where {F} - return _call_evaluator, prep(_call_evaluator, adtype, x, DI.Constant(evaluator)), true + constants = (DI.Constant(evaluator.f), map(DI.Constant, evaluator.context)...) 
+ return ( + _call_evaluator, + prep(_call_evaluator, adtype, x, constants...), + Val(length(evaluator.context)), + ) end +@inline _wrap_cache(target, gp, jp, ::Val{Mode}) where {Mode} = + DICache{Mode}(target, gp, jp) + function AbstractPPL.prepare( - adtype::AbstractADType, problem, x::AbstractVector{<:Real}; check_dims::Bool=true + adtype::AbstractADType, + problem, + x::AbstractVector{<:Real}; + check_dims::Bool=true, + context::Tuple=(), ) - evaluator = AbstractPPL.prepare(problem, x; check_dims)::VectorEvaluator - y = evaluator(x) - y isa Union{Number,AbstractVector} || throw( - ArgumentError( - "A prepared AD evaluator must return a scalar or AbstractVector; got $(typeof(y)).", - ), - ) + evaluator = AbstractPPL.prepare(problem, x; check_dims, context)::VectorEvaluator + arity = _ad_output_arity(evaluator(x)) if length(x) == 0 - # DI prep crashes on length-0 input (e.g. ForwardDiff `BoundsError`); the - # `Val(0)` sentinel keeps the `gradient_prep === nothing` arity check meaningful. - gp, jp = y isa Number ? (Val(0), nothing) : (nothing, Val(0)) - return Prepared(adtype, evaluator, DICache(_call_evaluator, gp, jp, true)) + # DI prep crashes on length-0 input (e.g. ForwardDiff `BoundsError`); `Val(0)` + # is the arity sentinel for the `gradient_prep === nothing` check (the AD entry + # returns before DI runs). `Mode` reads `evaluator.context`, as in `_prepare_di`. + gp, jp = arity === :scalar ? 
(Val(0), nothing) : (nothing, Val(0)) + cache = _wrap_cache(_call_evaluator, gp, jp, Val(length(evaluator.context))) + return Prepared(adtype, evaluator, cache) end - if y isa Number - target, gradient_prep, use_context = _prepare_di( - DI.prepare_gradient, adtype, x, evaluator - ) + if arity === :scalar + target, gradient_prep, mode = _prepare_di(DI.prepare_gradient, adtype, x, evaluator) return Prepared( - adtype, evaluator, DICache(target, gradient_prep, nothing, use_context) + adtype, evaluator, _wrap_cache(target, gradient_prep, nothing, mode) ) end - target, jacobian_prep, use_context = _prepare_di( - DI.prepare_jacobian, adtype, x, evaluator - ) - return Prepared(adtype, evaluator, DICache(target, nothing, jacobian_prep, use_context)) + target, jacobian_prep, mode = _prepare_di(DI.prepare_jacobian, adtype, x, evaluator) + return Prepared(adtype, evaluator, _wrap_cache(target, nothing, jacobian_prep, mode)) end +# Hot-path dispatch is by `Mode` (closure vs constants), resolved at compile +# time. The unconstrained method matches every non-`:closure` `Mode` (i.e. +# any `Int N`); `:closure` is strictly more specific and wins for compiled +# tapes. On the constants path we always pass `DI.Constant(eval.f)` plus the +# `N` context constants — `N == 0` collapses the `map` splat to nothing. 
+@inline _di_value_and_gradient(c::DICache{:closure}, ad, x, _) = + DI.value_and_gradient(c.target, c.gradient_prep, ad, x) +@inline _di_value_and_gradient(c::DICache, ad, x, eval) = DI.value_and_gradient( + c.target, + c.gradient_prep, + ad, + x, + DI.Constant(eval.f), + map(DI.Constant, eval.context)..., +) + +@inline _di_value_and_jacobian(c::DICache{:closure}, ad, x, _) = + DI.value_and_jacobian(c.target, c.jacobian_prep, ad, x) +@inline _di_value_and_jacobian(c::DICache, ad, x, eval) = DI.value_and_jacobian( + c.target, + c.jacobian_prep, + ad, + x, + DI.Constant(eval.f), + map(DI.Constant, eval.context)..., +) + @inline function AbstractPPL.value_and_gradient!!( p::Prepared{<:AbstractADType,<:VectorEvaluator,<:DICache}, x::AbstractVector{T} ) where {T<:Real} - p.cache.gradient_prep === nothing && - throw(ArgumentError("`value_and_gradient!!` requires a scalar-valued function.")) - T <: Integer && Evaluators._reject_integer_input(x) - Evaluators._check_vector_length(p.evaluator.dim, x) + p.cache.gradient_prep === nothing && Evaluators._throw_gradient_needs_scalar() + Evaluators._check_ad_input(p.evaluator, x) # Bypass DI on length-0 input — DI prep paths fail (e.g. ForwardDiff # `BoundsError`); typed `T[]` matches the caller's element type. 
length(x) == 0 && return (p.evaluator(x), T[]) - return if p.cache.use_context - DI.value_and_gradient( - p.cache.target, p.cache.gradient_prep, p.adtype, x, DI.Constant(p.evaluator) - ) - else - DI.value_and_gradient(p.cache.target, p.cache.gradient_prep, p.adtype, x) - end + return _di_value_and_gradient(p.cache, p.adtype, x, p.evaluator) end @inline function AbstractPPL.value_and_jacobian!!( p::Prepared{<:AbstractADType,<:VectorEvaluator,<:DICache}, x::AbstractVector{T} ) where {T<:Real} - p.cache.jacobian_prep === nothing && - throw(ArgumentError("`value_and_jacobian!!` requires a vector-valued function.")) - T <: Integer && Evaluators._reject_integer_input(x) - Evaluators._check_vector_length(p.evaluator.dim, x) + p.cache.jacobian_prep === nothing && Evaluators._throw_jacobian_needs_vector() + Evaluators._check_ad_input(p.evaluator, x) if length(x) == 0 val = p.evaluator(x) return (val, similar(x, length(val), 0)) end - return if p.cache.use_context - DI.value_and_jacobian( - p.cache.target, p.cache.jacobian_prep, p.adtype, x, DI.Constant(p.evaluator) - ) - else - DI.value_and_jacobian(p.cache.target, p.cache.jacobian_prep, p.adtype, x) - end + return _di_value_and_jacobian(p.cache, p.adtype, x, p.evaluator) end end # module diff --git a/ext/AbstractPPLMooncakeExt.jl b/ext/AbstractPPLMooncakeExt.jl new file mode 100644 index 00000000..b07af8ae --- /dev/null +++ b/ext/AbstractPPLMooncakeExt.jl @@ -0,0 +1,209 @@ +module AbstractPPLMooncakeExt + +using AbstractPPL: AbstractPPL +using AbstractPPL.Evaluators: + Evaluators, Prepared, VectorEvaluator, NamedTupleEvaluator, _ad_output_arity +using ADTypes: AutoMooncake, AutoMooncakeForward +using Mooncake: Mooncake + +const _MooncakeAD = Union{AutoMooncake,AutoMooncakeForward} + +# `NamedTupleEvaluator` is the callable on the NamedTuple path; `NoTangent` +# stops Mooncake from deriving a `Tangent{NamedTuple{...}}` for its fields +# on every backward pass. 
The `VectorEvaluator` override is a defensive +# guard — vector preps hand Mooncake `evaluator.f`, never the evaluator wrapper. +Mooncake.tangent_type(::Type{<:VectorEvaluator}) = Mooncake.NoTangent +Mooncake.tangent_type(::Type{<:NamedTupleEvaluator}) = Mooncake.NoTangent + +# Type parameters: +# +# * `A::Symbol` — output arity, `:scalar` or `:vector`. Drives the +# gradient/jacobian dispatch and the arity-mismatch errors. +# * `C` — the underlying Mooncake cache, or `Nothing` for the +# empty-input shortcut. +struct MooncakeCache{A,C} + cache::C +end +MooncakeCache{A}(cache::C) where {A,C} = MooncakeCache{A,C}(cache) + +_mooncake_config(adtype) = adtype.config === nothing ? Mooncake.Config() : adtype.config + +# NamedTuple-path helper: Mooncake exposes separate `prepare_*_cache` +# entries per AD mode but the call shape (target + values) is the same. +function _mooncake_gradient_cache(::AutoMooncake, f, x; config) + return Mooncake.prepare_gradient_cache(f, x; config) +end +function _mooncake_gradient_cache(::AutoMooncakeForward, f, x; config) + return Mooncake.prepare_derivative_cache(f, x; config) +end + +function AbstractPPL.prepare( + adtype::_MooncakeAD, problem, values::NamedTuple; check_dims::Bool=true +) + evaluator = AbstractPPL.prepare(problem, values; check_dims)::NamedTupleEvaluator + config = _mooncake_config(adtype) + cache = _mooncake_gradient_cache(adtype, evaluator, values; config) + return Prepared(adtype, evaluator, cache) +end + +""" + prepare(adtype::AutoMooncake, problem, x; check_dims=true, context::Tuple=()) + prepare(adtype::AutoMooncakeForward, problem, x; check_dims=true, context::Tuple=()) + +Prepare a Mooncake gradient/Jacobian evaluator for a dense vector input. + +Non-`DenseVector` inputs (views, strided slices) are rejected: Mooncake +assumes a contiguous primal and otherwise returns shape-incorrect tangents +on reverse mode and crashes on forward/Jacobian paths. 
+ +`context` follows the base `prepare` contract — the prepared evaluator +computes `problem(x, context...)` with AD differentiating only `x`. One +Mooncake-specific restriction: vector-valued problems require `context=()`. + +Empty input (`length(x) == 0`) is supported under the same `context` rule; Mooncake +builds no tape for zero-length `x`, so the AD entry points short-circuit without +invoking it: `value_and_gradient!!` returns `(problem(x, context...), eltype(x)[])` and +`value_and_jacobian!!` returns the value with an empty `length(value) × 0` Jacobian. +""" +function AbstractPPL.prepare( + adtype::_MooncakeAD, + problem, + x::AbstractVector{<:Real}; + check_dims::Bool=true, + context::Tuple=(), +) + x isa DenseVector || throw( + ArgumentError( + "AutoMooncake / AutoMooncakeForward require a dense vector input " * + "(e.g. `Vector{<:Real}`); got $(typeof(x)). Wrap non-dense inputs " * + "(views, strided slices) with `collect` before calling `prepare`.", + ), + ) + evaluator = AbstractPPL.prepare(problem, x; check_dims, context)::VectorEvaluator + arity = _ad_output_arity(evaluator(x)) + config = _mooncake_config(adtype) + if !isempty(evaluator.context) && arity !== :scalar + throw( + ArgumentError( + "Non-empty `context` is only supported for scalar-valued problems." + ), + ) + end + # Mooncake builds no tape for length-zero `x`; tag with `Nothing` so the + # empty-input methods below shortcut without invoking Mooncake. Empty `x` + # with non-empty context also routes here — the hot-path shortcut just + # calls `p.evaluator(x)` which already does `f([], context...)`. + length(x) == 0 && return Prepared(adtype, evaluator, MooncakeCache{arity}(nothing)) + # Compile the tape on the evaluator's `f` and `context` (not the raw + # `problem` / `context` kwargs): a downstream override of structural + # `prepare` may return a `VectorEvaluator` whose `.f`/`.context` differ + # from the caller-supplied values, and the hot path reads them off the + # evaluator. 
Forward mode uses `prepare_derivative_cache` for both + # arities; the splat is a no-op for vector arity (empty `context`). + cache = if adtype isa AutoMooncake + if arity === :scalar + Mooncake.prepare_gradient_cache(evaluator.f, x, evaluator.context...; config) + else + Mooncake.prepare_pullback_cache(evaluator.f, x; config) + end + else + Mooncake.prepare_derivative_cache(evaluator.f, x, evaluator.context...; config) + end + return Prepared(adtype, evaluator, MooncakeCache{arity}(cache)) +end + +# Input-shape validation is delegated to the AD backend: Mooncake catches +# top-level NamedTuple-type mismatches, and the inner +# `NamedTupleEvaluator{CheckInput}` callable catches nested-array size +# mismatches (gated by `check_dims`). Running `_assert_namedtuple_shape` +# again here would duplicate the second check on every AD call. +# (`∂f` is `NoTangent` thanks to the `tangent_type` overload above.) +@inline function AbstractPPL.value_and_gradient!!( + p::Prepared{<:_MooncakeAD,<:NamedTupleEvaluator}, values::NamedTuple +) + val, (_, grad) = Mooncake.value_and_gradient!!(p.cache, p.evaluator, values) + return (val, grad) +end + +# Empty-input shortcut. `MooncakeCache{:scalar,Nothing}` is strictly more +# specific than `MooncakeCache{:scalar}` on `C`, so dispatch unambiguously +# selects this method over the general scalar-gradient hot path below for +# zero-length `x`. +@inline function AbstractPPL.value_and_gradient!!( + p::Prepared{<:_MooncakeAD,<:VectorEvaluator,<:MooncakeCache{:scalar,Nothing}}, + x::AbstractVector{T}, +) where {T<:Real} + Evaluators._check_ad_input(p.evaluator, x) + return (p.evaluator(x), T[]) +end + +# Scalar-gradient hot path. Reverse mode (`Mooncake.Cache`) needs +# `args_to_zero` to mark `x` as the lone active input (`false` on `f`, +# `true` on `x`, `false` on each context value); forward mode +# (`ForwardCache`) derives activity from its seeded argument and rejects +# the kwarg. 
The `p.adtype isa AutoMooncake` branch is compile-folded +# since `adtype`'s concrete type lives in `Prepared`'s type parameters. +# Empty `context` collapses the splat and reduces `args_to_zero` to +# `(false, true)`. `tangents[2]` is the `x`-gradient; trailing entries +# (one per context value) are inactive and discarded. +@inline function AbstractPPL.value_and_gradient!!( + p::Prepared{<:_MooncakeAD,<:VectorEvaluator,<:MooncakeCache{:scalar}}, + x::AbstractVector{T}, +) where {T<:Real} + Evaluators._check_ad_input(p.evaluator, x) + e = p.evaluator + val, tangents = if p.adtype isa AutoMooncake + Mooncake.value_and_gradient!!( + p.cache.cache, + e.f, + x, + e.context...; + args_to_zero=(false, true, map(_ -> false, e.context)...), + ) + else + Mooncake.value_and_gradient!!(p.cache.cache, e.f, x, e.context...) + end + return (val, tangents[2]) +end + +# Arity-mismatch errors as dedicated methods so dispatch on +# `MooncakeCache{:scalar}` vs `{:vector}` resolves at compile time instead of +# a runtime check on the cache contents. +@inline function AbstractPPL.value_and_gradient!!( + ::Prepared{<:_MooncakeAD,<:VectorEvaluator,<:MooncakeCache{:vector}}, + ::AbstractVector{<:Real}, +) + return Evaluators._throw_gradient_needs_scalar() +end + +@inline function AbstractPPL.value_and_jacobian!!( + ::Prepared{<:_MooncakeAD,<:VectorEvaluator,<:MooncakeCache{:scalar}}, + ::AbstractVector{<:Real}, +) + return Evaluators._throw_jacobian_needs_vector() +end + +# Empty-input jacobian shortcut. Same `Nothing` specificity trick as the +# scalar case above; skips Mooncake entirely. 
+@inline function AbstractPPL.value_and_jacobian!!( + p::Prepared{<:_MooncakeAD,<:VectorEvaluator,<:MooncakeCache{:vector,Nothing}}, + x::AbstractVector{T}, +) where {T<:Real} + Evaluators._check_ad_input(p.evaluator, x) + val = p.evaluator(x) + return (val, similar(x, length(val), 0)) +end + +@inline function AbstractPPL.value_and_jacobian!!( + p::Prepared{<:_MooncakeAD,<:VectorEvaluator,<:MooncakeCache{:vector}}, + x::AbstractVector{T}, +) where {T<:Real} + Evaluators._check_ad_input(p.evaluator, x) + # Vector arity rejects non-empty `context` at prepare time, so the tape + # is compiled on `problem(x)` and there is no splat or `args_to_zero` to + # propagate. Mooncake's `value_and_jacobian!!` returns `(val, jac)` + # directly with `x` as the only active argument. + return Mooncake.value_and_jacobian!!(p.cache.cache, p.evaluator.f, x) +end + +end # module diff --git a/ext/AbstractPPLTestExt.jl b/ext/AbstractPPLTestExt.jl index 6f7463da..3ac01060 100644 --- a/ext/AbstractPPLTestExt.jl +++ b/ext/AbstractPPLTestExt.jl @@ -93,7 +93,15 @@ function AbstractPPL.generate_testcases(::Val{:edge}) zeros(3), [2.0, 3.0, 4.0], (prepared, x) -> AbstractPPL.value_and_gradient!!(prepared, x), - ArgumentError, + r"scalar-valued", + ), + ErrorCase( + "jacobian of scalar output", + QuadraticProblem(), + zeros(3), + [3.0, 1.0, 2.0], + (prepared, x) -> AbstractPPL.value_and_jacobian!!(prepared, x), + r"vector-valued", ), ErrorCase( "gradient of vector-valued output, empty input", @@ -146,6 +154,20 @@ function AbstractPPL.generate_testcases(::Val{:edge}) ) end +function AbstractPPL.generate_testcases(::Val{:namedtuple}) + return ( + ValueCase( + "scalar output over (x::Real, y::Vector)", + vs -> vs.x^2 + sum(abs2, vs.y), + (x=0.0, y=zeros(2)), + (x=3.0, y=[1.0, 2.0]), + 14.0, + (x=6.0, y=[2.0, 4.0]), + nothing, + ), + ) +end + function AbstractPPL.run_testcases( ::Val{:vector}, prepare_fn=AbstractPPL.prepare; adtype, atol=0, rtol=1e-10 ) @@ -178,4 +200,57 @@ function 
AbstractPPL.run_testcases(::Val{:edge}, prepare_fn=AbstractPPL.prepare; return nothing end +function AbstractPPL.run_testcases( + ::Val{:namedtuple}, prepare_fn=AbstractPPL.prepare; adtype, atol=0, rtol=1e-10 +) + for case in generate_testcases(Val(:namedtuple)) + @testset "$(case.name)" begin + prepared = prepare_fn(adtype, case.f, case.x_proto) + @test prepared(case.x) ≈ case.value atol = atol rtol = rtol + if case.gradient !== nothing + val, grad = AbstractPPL.value_and_gradient!!(prepared, case.x) + @test val ≈ case.value atol = atol rtol = rtol + for k in keys(case.gradient) + @test getproperty(grad, k) ≈ getproperty(case.gradient, k) atol = atol rtol = + rtol + end + end + end + end + return nothing +end + +# Drive `value_and_{gradient,jacobian}!!` twice with different inputs against +# the same `prepared` evaluator to exercise cache reuse — catches backends +# whose cache state is corrupted by a prior call. +function AbstractPPL.run_testcases( + ::Val{:cache_reuse}, prepare_fn=AbstractPPL.prepare; adtype, atol=0, rtol=1e-10 +) + @testset "scalar output, repeated calls" begin + prepared = prepare_fn(adtype, QuadraticProblem(), zeros(3)) + for (x, value, gradient) in ( + ([1.0, 2.0, 3.0], 14.0, [2.0, 4.0, 6.0]), + ([4.0, 5.0, 6.0], 77.0, [8.0, 10.0, 12.0]), + ([0.5, -1.0, 2.0], 5.25, [1.0, -2.0, 4.0]), + ) + val, grad = AbstractPPL.value_and_gradient!!(prepared, x) + @test val ≈ value atol = atol rtol = rtol + @test grad ≈ gradient atol = atol rtol = rtol + end + end + @testset "vector output, repeated calls" begin + prepared = prepare_fn(adtype, VectorValuedProblem(), zeros(3)) + for (x, value, jacobian) in ( + ([2.0, 3.0, 4.0], [6.0, 7.0], [3.0 2.0 0.0; 0.0 1.0 1.0]), + ([5.0, 1.0, 7.0], [5.0, 8.0], [1.0 5.0 0.0; 0.0 1.0 1.0]), + ([0.0, 4.0, -2.0], [0.0, 2.0], [4.0 0.0 0.0; 0.0 1.0 1.0]), + ) + val, jac = AbstractPPL.value_and_jacobian!!(prepared, x) + @test val ≈ value atol = atol rtol = rtol + @test jac ≈ jacobian atol = atol rtol = rtol + end + end + 
return nothing +end + end # module diff --git a/src/AbstractPPL.jl b/src/AbstractPPL.jl index 0b50ca9a..c70b349d 100644 --- a/src/AbstractPPL.jl +++ b/src/AbstractPPL.jl @@ -19,8 +19,9 @@ using .Evaluators: prepare, value_and_gradient!!, value_and_jacobian!! Return a tuple of test cases for the conformance `group`. Implemented by the `Test` extension (`AbstractPPLTestExt`). Reserved group keys (extensions must not redefine these): `:vector` for value/gradient/jacobian round-trips on -vector-input evaluators; `:edge` for error-path cases. Downstream packages may -add other keys. +vector-input evaluators; `:namedtuple` for `NamedTuple`-input evaluators; +`:edge` for error-path cases; `:cache_reuse` for repeated calls against a +single prepared evaluator. Downstream packages may add other keys. """ function generate_testcases end diff --git a/src/evaluators/Evaluators.jl b/src/evaluators/Evaluators.jl index a88f2910..dfae89f3 100644 --- a/src/evaluators/Evaluators.jl +++ b/src/evaluators/Evaluators.jl @@ -41,8 +41,8 @@ Prepared(adtype::AbstractADType, evaluator) = Prepared(adtype, evaluator, nothin """ prepare(problem, values::NamedTuple; check_dims::Bool=true) - prepare(problem, x::AbstractVector{<:Real}; check_dims::Bool=true) - prepare(adtype, problem, x::AbstractVector{<:Real}; check_dims::Bool=true) + prepare(problem, x::AbstractVector{<:Real}; check_dims::Bool=true, context::Tuple=()) + prepare(adtype, problem, x::AbstractVector{<:Real}; check_dims::Bool=true, context::Tuple=()) Prepare a callable evaluator for `problem`. @@ -56,6 +56,11 @@ the input shape on each call. Pass `check_dims=false` to skip the per-call check, e.g. inside an AD backend's hot path where the input shape is already guaranteed. +The vector-input forms accept a `context::Tuple` of constant arguments threaded +through to `problem`: the prepared evaluator computes `problem(x, context...)`, +and AD backends differentiate only with respect to `x`. 
`context=()` (the +default) preserves the unary `problem(x)` contract. + The three-argument AD-aware form may invoke `problem` once during preparation to detect output arity (scalar vs vector) and select gradient or jacobian machinery accordingly. Avoid `prepare` calls when `problem` has side effects @@ -69,8 +74,10 @@ function prepare end function prepare(problem, values::NamedTuple; check_dims::Bool=true) return NamedTupleEvaluator{check_dims}(problem, values) end -function prepare(problem, x::AbstractVector{<:Real}; check_dims::Bool=true) - return VectorEvaluator{check_dims}(problem, length(x)) +function prepare( + problem, x::AbstractVector{<:Real}; check_dims::Bool=true, context::Tuple=() +) + return VectorEvaluator{check_dims}(problem, length(x), context) end """ @@ -93,8 +100,8 @@ The Jacobian has shape `(length(value), length(x))`. function value_and_jacobian!! end """ - VectorEvaluator{CheckInput}(f, dim) - VectorEvaluator(f, dim) # equivalent to `VectorEvaluator{true}(f, dim)` + VectorEvaluator{CheckInput}(f, dim, context::Tuple=()) + VectorEvaluator(f, dim, context::Tuple=()) # equivalent to `VectorEvaluator{true}(f, dim, context)` Evaluator shape for scalar functions of a vector input. Part of the extension author API; end users interact with the wrapping `Prepared` instead. @@ -105,20 +112,28 @@ author API; end users interact with the wrapping `Prepared` instead. where input shape is already guaranteed and the runtime check would persist in the dual/shadow hot path. +`context` is a tuple of constant arguments threaded through to `f`: +`evaluator(x)` computes `f(x, context...)`. AD backends treat every value in +`context` as inactive and differentiate only with respect to `x`. The default +empty tuple keeps the unary `f(x)` contract. + A bare `VectorEvaluator` is *not* differentiable; gradient capability is the contract of the wrapping `Prepared` returned by `prepare(adtype, ...)`. 
""" -struct VectorEvaluator{CheckInput,F} +struct VectorEvaluator{CheckInput,F,C<:Tuple} f::F dim::Int - function VectorEvaluator{CheckInput}(f::F, dim::Int) where {CheckInput,F} + context::C + function VectorEvaluator{CheckInput}( + f::F, dim::Int, context::C=() + ) where {CheckInput,F,C<:Tuple} CheckInput isa Bool || throw(ArgumentError("`CheckInput` must be a Bool.")) dim >= 0 || throw(ArgumentError("`dim` must be non-negative, got $dim.")) - return new{CheckInput,F}(f, dim) + return new{CheckInput,F,C}(f, dim, context) end end -VectorEvaluator(f, dim::Int) = VectorEvaluator{true}(f, dim) +VectorEvaluator(f, dim::Int, context::Tuple=()) = VectorEvaluator{true}(f, dim, context) """ NamedTupleEvaluator{CheckInput}(f, inputspec) @@ -177,15 +192,30 @@ function _check_vector_length(dim::Int, x) return nothing end +# Shared input validation for AD-backend `value_and_{gradient,jacobian}!!` entry +# points. Same compile-time `T <: Integer` elision as the `VectorEvaluator` body. +# Gated by `CheckInput`: the `{false}` overload is a complete no-op (it skips the +# integer rejection too, unlike the `{false}` callable), so the AD hot path pays +# nothing once the caller opts out via `prepare(...; check_dims=false)`. +function _check_ad_input(e::VectorEvaluator{true}, x::AbstractVector{T}) where {T} + T <: Integer && _reject_integer_input(x) + _check_vector_length(e.dim, x) + return nothing +end +_check_ad_input(::VectorEvaluator{false}, ::AbstractVector) = nothing + +# Both bodies rely on `T <: Integer` being a static check so the AD hot path +# (Float/dual `T`) elides the branch; the `{false}` callable additionally skips +# `_check_vector_length` since AD libraries pass length-matching dual inputs. function (e::VectorEvaluator{true})(x::AbstractVector{T}) where {T} T <: Integer && _reject_integer_input(x) _check_vector_length(e.dim, x) - return e.f(x) + return e.f(x, e.context...) 
end function (e::VectorEvaluator{false})(x::AbstractVector{T}) where {T} T <: Integer && _reject_integer_input(x) - return e.f(x) + return e.f(x, e.context...) end function (e::NamedTupleEvaluator{true})(values::NamedTuple) @@ -195,13 +225,15 @@ end (e::NamedTupleEvaluator{false})(values::NamedTuple) = e.f(values) """ - _assert_namedtuple_shape(e::NamedTupleEvaluator, values) + _assert_namedtuple_shape(e::NamedTupleEvaluator{true}, values) Throw `ArgumentError` unless `values` has the same type as the prototype captured during preparation, including matching `size` for any nested `AbstractArray` leaves. Also throws if the prototype contains a leaf type outside the supported -set (`Real`, `Complex`, `AbstractArray`, `Tuple`, `NamedTuple`). No-op when `e` -was constructed with `CheckInput=false`. +set (`Real`, `Complex`, `AbstractArray`, `Tuple`, `NamedTuple`). + +Gated by `CheckInput`: the `{false}` overload is a no-op so AD hot paths and +other opt-out callers pay nothing. """ function _assert_namedtuple_shape(e::NamedTupleEvaluator{true}, values) typeof(values) === typeof(e.inputspec) || throw( @@ -218,6 +250,30 @@ function _assert_namedtuple_shape(e::NamedTupleEvaluator{true}, values) end _assert_namedtuple_shape(::NamedTupleEvaluator{false}, _) = nothing +# Classify the output of a probe `evaluator(x)` call into the two arities the +# AD interface supports — `:scalar` routes to gradient prep, `:vector` to +# jacobian prep. Shared by the DI and Mooncake extensions so both surface the +# same error message for unsupported output types. 
+function _ad_output_arity(y) + y isa Number && return :scalar + y isa AbstractVector && return :vector + throw( + ArgumentError( + "A prepared AD evaluator must return a scalar or AbstractVector; got $(typeof(y)).", + ), + ) +end + +# Arity-mismatch errors shared by the DI and Mooncake extensions; kept here so +# the `:edge` testcase regexes (`r"scalar-valued"`, `r"vector-valued"`) pin a +# single error string instead of one per backend. +function _throw_gradient_needs_scalar() + throw(ArgumentError("`value_and_gradient!!` requires a scalar-valued function.")) +end +function _throw_jacobian_needs_vector() + throw(ArgumentError("`value_and_jacobian!!` requires a vector-valued function.")) +end + # Complements the `typeof` check above: same-typed arrays can differ in `size`. # Arrays with non-`Real`/`Complex` eltype are walked element-wise to catch # inner mismatches. Unknown leaves throw, mirroring the supported-leaves diff --git a/test/Project.toml b/test/Project.toml index 1bd57d0d..122f7e4e 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -16,7 +16,7 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [compat] ADTypes = "1" -AbstractPPL = "0.14" +AbstractPPL = "0.15" Accessors = "0.1" Aqua = "0.8" DimensionalData = "0.29, 0.30" diff --git a/test/evaluators/Evaluators.jl b/test/evaluators/Evaluators.jl index c8454438..8b96d445 100644 --- a/test/evaluators/Evaluators.jl +++ b/test/evaluators/Evaluators.jl @@ -72,6 +72,21 @@ end # Unsupported leaf types are rejected rather than silently passing. ne_string = AbstractPPL.Evaluators.NamedTupleEvaluator(x -> length(x.s), (s="abc",)) @test_throws r"Supported leaves" ne_string((s="abcde",)) + + # `_check_ad_input` is dispatch-gated by `CheckInput` so the AD hot + # path pays nothing when the evaluator was prepared with + # `check_dims=false`. 
+ ve_checked = AbstractPPL.Evaluators.VectorEvaluator{true}(sum, 3) + @test AbstractPPL.Evaluators._check_ad_input(ve_checked, [1.0, 2.0, 3.0]) === + nothing + @test_throws DimensionMismatch AbstractPPL.Evaluators._check_ad_input( + ve_checked, [1.0, 2.0] + ) + @test_throws r"floating-point" AbstractPPL.Evaluators._check_ad_input( + ve_checked, [1, 2, 3] + ) + @test AbstractPPL.Evaluators._check_ad_input(ve_unchecked, [1.0, 2.0]) === nothing + @test AbstractPPL.Evaluators._check_ad_input(ve_unchecked, [1, 2, 3]) === nothing end @testset "prepare (structural)" begin @@ -102,6 +117,15 @@ end pv_unchecked = prepare(sum, zeros(3); check_dims=false) @test pv_unchecked isa VectorEvaluator{false} @test pv_unchecked([1.0, 2.0]) == 3.0 # wrong length, no error + + # `context` threads constant args through to the callable; AD-unaware + # `prepare` constructs the `VectorEvaluator` with the same shape and + # `prepared(x)` evaluates `f(x, context...)`. + affine(x, a, b) = sum(x) * a + b + pv_ctx = prepare(affine, zeros(2); context=(2.0, 1.0)) + @test pv_ctx isa VectorEvaluator{true} + @test pv_ctx.context === (2.0, 1.0) + @test pv_ctx([3.0, 4.0]) == 15.0 end @testset "prepare (AD-aware)" begin diff --git a/test/ext/differentiationinterface/main.jl b/test/ext/differentiationinterface/main.jl index 4e19a8ce..636f237e 100644 --- a/test/ext/differentiationinterface/main.jl +++ b/test/ext/differentiationinterface/main.jl @@ -3,27 +3,58 @@ Pkg.activate(@__DIR__) Pkg.develop(; path=joinpath(@__DIR__, "..", "..", "..")) Pkg.instantiate() -using AbstractPPL: AbstractPPL, run_testcases +using AbstractPPL: AbstractPPL, prepare, run_testcases, value_and_gradient!! 
+using ADTypes: AutoForwardDiff, AutoReverseDiff
-using DifferentiationInterface
+using DifferentiationInterface: DifferentiationInterface as DI
 using ForwardDiff
 using ReverseDiff
 using Test
 
+# Handle to the loaded extension module, used to name its `DICache` type in
+# the assertions further down.
+const DIExt = Base.get_extension(AbstractPPL, :AbstractPPLDifferentiationInterfaceExt)
+
+# Scalar objective (sum of squares) used by the `DICache` testset.
+quadratic(x::AbstractVector{<:Real}) = sum(xi -> xi^2, x)
+
 @testset "AbstractPPLDifferentiationInterfaceExt" begin
-    run_testcases(Val(:vector); adtype=AutoForwardDiff(), atol=1e-6, rtol=1e-6)
-    run_testcases(Val(:edge); adtype=AutoForwardDiff())
+    @testset "ForwardDiff" begin
+        run_testcases(Val(:vector); adtype=AutoForwardDiff(), atol=1e-6, rtol=1e-6)
+        run_testcases(Val(:cache_reuse); adtype=AutoForwardDiff(), atol=1e-6, rtol=1e-6)
+        run_testcases(Val(:edge); adtype=AutoForwardDiff())
+    end
+
+    # Compiled-tape ReverseDiff goes through the `_prepare_di(::AutoReverseDiff{true}, …)`
+    # specialisation that closes the evaluator into a `Base.Fix2` target — the
+    # `:cache_reuse` group exercises that path across multiple inputs.
+    @testset "ReverseDiff (compiled tape)" begin
+        adtype = AutoReverseDiff(; compile=true)
+        run_testcases(Val(:vector); adtype=adtype, atol=1e-6, rtol=1e-6)
+        run_testcases(Val(:cache_reuse); adtype=adtype, atol=1e-6, rtol=1e-6)
+        run_testcases(Val(:edge); adtype=adtype)
+    end
+
+    # `DICache`'s `Mode` parameter is either `:closure` (compiled-tape
+    # ReverseDiff) or the integer context length on the constants path. The
+    # constants-path integer also documents how many `DI.Constant`s the AD
+    # call passes.
+    @testset "DICache encodes the call mode as a type parameter" begin
+        x = [1.0, 2.0, 3.0]
+        prep_noctx = prepare(AutoForwardDiff(), quadratic, x)
+        prep_closure = prepare(AutoReverseDiff(; compile=true), quadratic, x)
+        affine(y, a, b) = a * sum(abs2, y) + b
+        prep_ctx = prepare(AutoForwardDiff(), affine, x; context=(2.0, 1.0))
 
-    @testset "AutoReverseDiff compiled tape (no-context path)" begin
-        ad = AutoReverseDiff(; compile=true)
-        p_scalar = AbstractPPL.prepare(ad, x -> sum(abs2, x), zeros(3))
-        p_vector = AbstractPPL.prepare(ad, x -> [x[1] * x[2], x[2] + x[3]], zeros(3))
+        # Expected parameter: `0` with no context, `:closure` for compiled-tape
+        # ReverseDiff, and `2` for the two-element context.
+        @test prep_noctx.cache isa DIExt.DICache{0}
+        @test prep_closure.cache isa DIExt.DICache{:closure}
+        @test prep_ctx.cache isa DIExt.DICache{2}
 
-        @test !p_scalar.cache.use_context
-        @test !isnothing(p_scalar.cache.gradient_prep.tape)
-        @test !p_vector.cache.use_context
-        @test !isnothing(p_vector.cache.jacobian_prep.tape)
+        # Non-empty-context primal matches the underlying `f(x, context...)`.
+        @test prep_ctx(x) == affine(x, 2.0, 1.0)
+        val, grad = value_and_gradient!!(prep_ctx, x)
+        @test val == affine(x, 2.0, 1.0)
+        @test grad ≈ [4.0, 8.0, 12.0] # gradient of a * sum(abs2, x) + b is 2a * x, with a = 2
 
-        run_testcases(Val(:vector); adtype=ad, atol=1e-6, rtol=1e-6)
+        # Hot path is type-stable on all three preps.
+        @inferred value_and_gradient!!(prep_noctx, x)
+        @inferred value_and_gradient!!(prep_closure, x)
+        @inferred value_and_gradient!!(prep_ctx, x)
     end
 end
diff --git a/test/ext/mooncake/Project.toml b/test/ext/mooncake/Project.toml
new file mode 100644
index 00000000..6a5c2039
--- /dev/null
+++ b/test/ext/mooncake/Project.toml
@@ -0,0 +1,11 @@
+[deps]
+AbstractPPL = "7a57a42e-76ec-4ea3-a279-07e840d6d9cf"
+ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
+Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
+Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[compat]
+ADTypes = "1"
+Mooncake = "0.5.27"
+julia = "1.10"
diff --git a/test/ext/mooncake/main.jl b/test/ext/mooncake/main.jl
new file mode 100644
index 00000000..3c328a1a
--- /dev/null
+++ b/test/ext/mooncake/main.jl
@@ -0,0 +1,94 @@
+# Run in this directory's own environment, with the repository root (three
+# levels up) developed into it.
+using Pkg
+Pkg.activate(@__DIR__)
+Pkg.develop(; path=joinpath(@__DIR__, "..", "..", ".."))
+Pkg.instantiate()
+
+using AbstractPPL: AbstractPPL, prepare, run_testcases, value_and_gradient!!
+using ADTypes: AutoMooncake, AutoMooncakeForward
+using Mooncake
+using Test
+
+@testset "AbstractPPLMooncakeExt" begin
+    # Run the shared testcase groups against both Mooncake modes.
+    for (label, adtype) in (
+        ("Mooncake (reverse)", AutoMooncake()),
+        ("Mooncake (forward)", AutoMooncakeForward()),
+    )
+        @testset "$label" begin
+            run_testcases(Val(:vector); adtype=adtype, atol=1e-6, rtol=1e-6)
+            run_testcases(Val(:namedtuple); adtype=adtype, atol=1e-6, rtol=1e-6)
+            run_testcases(Val(:cache_reuse); adtype=adtype, atol=1e-6, rtol=1e-6)
+            run_testcases(Val(:edge); adtype=adtype)
+        end
+    end
+
+    @testset "context-lowered gradient" begin
+        # Callable wrapper that stores `offset` in a field; the lowered path
+        # passes the same value through `context` instead.
+        struct TinyProblem{T}
+            offset::T
+        end
+        raw_logdensity(x::AbstractVector{<:Real}, offset) = -0.5 * (x[1] - offset)^2
+        (p::TinyProblem)(x::AbstractVector{<:Real}) = raw_logdensity(x, p.offset)
+
+        x = [0.3]
+        problem = TinyProblem(0.1)
+        ad = AutoMooncake(; config=nothing)
+
+        generic = prepare(ad, problem, x; check_dims=false)
+        lowered = prepare(
+            ad, raw_logdensity, x; check_dims=false, context=(problem.offset,)
+        )
+
+        # `prepared(x)` evaluates `problem(x)` on the generic path and
+        # `raw_logdensity(x, context...)` on the lowered path; both should
+        # produce the same scalar.
+        @test generic(x) == problem(x)
+        @test lowered(x) == problem(x)
+
+        # The lowered path must reproduce the generic path's value and gradient.
+        @test value_and_gradient!!(generic, x) == value_and_gradient!!(lowered, x)
+
+        # Forward mode supports context too — same primal and (approximately)
+        # the same derivative as the reverse-mode lowered path on this scalar
+        # problem. Use `≈` because forward and reverse may differ in the last
+        # ULPs.
+        ad_fwd = AutoMooncakeForward(; config=nothing)
+        lowered_fwd = prepare(
+            ad_fwd, raw_logdensity, x; check_dims=false, context=(problem.offset,)
+        )
+        @test lowered_fwd(x) == problem(x)
+        val_fwd, grad_fwd = value_and_gradient!!(lowered_fwd, x)
+        val_rev, grad_rev = value_and_gradient!!(lowered, x)
+        @test val_fwd ≈ val_rev atol = 1e-12
+        @test grad_fwd ≈ grad_rev atol = 1e-12
+
+        # `prepare` rejects vector-valued problems when `context` is non-empty.
+        vec_problem(y, c) = [y[1] * c, y[1] + c]
+        @test_throws ArgumentError prepare(
+            ad, vec_problem, x; check_dims=false, context=(1.0,)
+        )
+
+        # Empty input with non-empty context is supported — the empty-input
+        # shortcut bypasses Mooncake and just calls `f([], context...)`. Use an `f`
+        # built on `sum(y; init=zero(eltype(y)))` since `raw_logdensity` indexes `x[1]`.
+        empty_logdensity(y::AbstractVector{<:Real}, offset) =
+            sum(y; init=zero(eltype(y))) + offset
+        empty_lowered = prepare(
+            ad, empty_logdensity, Float64[]; check_dims=false, context=(0.5,)
+        )
+        val0, grad0 = value_and_gradient!!(empty_lowered, Float64[])
+        @test val0 == empty_logdensity(Float64[], 0.5)
+        @test grad0 == Float64[]
+
+        # Jacobian on a scalar-only lowered cache surfaces our arity-mismatch error.
+        @test_throws r"vector-valued" AbstractPPL.value_and_jacobian!!(lowered, x)
+    end
+
+    @testset "dense vector requirement" begin
+        # Non-dense AbstractVectors (e.g. `view`s) are rejected up front rather
+        # than reaching Mooncake, where reverse-mode silently returns a
+        # `Mooncake.Tangent` and forward/Jacobian paths crash.
+        problem = x -> sum(abs2, x)
+        # `view` returns a `SubArray`, not a `Vector`; `prepare` must refuse it
+        # for both Mooncake modes.
+        v = view([1.0, 2.0, 3.0], :)
+        @test_throws r"dense vector" prepare(AutoMooncake(), problem, v)
+        @test_throws r"dense vector" prepare(AutoMooncakeForward(), problem, v)
+    end
+end
diff --git a/test/run_extras.jl b/test/run_extras.jl
index e557f363..cd2c157e 100644
--- a/test/run_extras.jl
+++ b/test/run_extras.jl
@@ -2,8 +2,9 @@
 #
 # Usage (from the repo root):
 # LABEL=ext/differentiationinterface julia test/run_extras.jl
+# LABEL=ext/mooncake julia test/run_extras.jl
 
-const VALID_LABELS = ("ext/differentiationinterface",)
+const VALID_LABELS = ("ext/differentiationinterface", "ext/mooncake")
 
 label = get(ENV, "LABEL", nothing)
 label in VALID_LABELS ||