2 changes: 1 addition & 1 deletion crystalformer/src/attention.py
@@ -84,7 +84,7 @@ def __init__(
         self.dropout_rate = dropout_rate

         if w_init is None:
-            w_init = hk.initializers.VarianceScaling(w_init_scale)
+            w_init = hk.initializers.VarianceScaling(1.0)
         self.w_init = w_init
         self.with_bias = with_bias
         self.b_init = b_init
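For context on the replacement value: per the test docstring below, 1.0 matches VarianceScaling's default scale. With haiku's defaults for this initializer (mode="fan_in", distribution="truncated_normal"), weights come out with variance of roughly scale / fan_in. A minimal standalone sketch of what the fixed fallback constructs (illustrative only, not part of this PR; the parameter name "w" and the shape are arbitrary):

import haiku as hk
import jax
import jax.numpy as jnp

def init_fn():
    # Same initializer the fixed fallback now builds: VarianceScaling(1.0).
    w = hk.get_parameter(
        "w", shape=(16, 32), dtype=jnp.float32,
        init=hk.initializers.VarianceScaling(1.0),
    )
    return w

f = hk.transform(init_fn)
params = f.init(jax.random.PRNGKey(0))
# Empirical variance is close to 1.0 / fan_in = 1/16 for shape (16, 32).
print(jnp.var(params["~"]["w"]))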
55 changes: 55 additions & 0 deletions tests/test_attention.py
@@ -0,0 +1,55 @@
"""Tests for MultiHeadAttention -- focuses on the w_init=None fix.

Before the fix, calling MultiHeadAttention with w_init=None (the default)
raised NameError: name 'w_init_scale' is not defined.
"""
import haiku as hk
import jax
import jax.numpy as jnp

from crystalformer.src.attention import MultiHeadAttention


def test_w_init_none_does_not_raise():
"""w_init=None (the default) must not raise NameError.

Regression test for: NameError: name 'w_init_scale' is not defined.
The fix replaces the undefined variable with the literal value 1.0,
matching the upstream haiku default for VarianceScaling.
"""
def fn(q, k, v):
mha = MultiHeadAttention(
num_heads=2,
key_size=8,
model_size=16,
w_init=None, # default -- was broken before fix
)
return mha(q, k, v)

f = hk.without_apply_rng(hk.transform(fn))
key = jax.random.PRNGKey(0)
x = jax.random.normal(key, (4, 16))
params = f.init(key, x, x, x)
out = f.apply(params, x, x, x)

assert out.shape == (4, 16)
assert jnp.isfinite(out).all(), "w_init=None path produces NaN/Inf"


def test_w_init_explicit_still_works():
"""Explicit w_init continues to work after the fix."""
def fn(q, k, v):
mha = MultiHeadAttention(
num_heads=2,
key_size=8,
model_size=16,
w_init=hk.initializers.VarianceScaling(1.0),
)
return mha(q, k, v)

f = hk.without_apply_rng(hk.transform(fn))
key = jax.random.PRNGKey(1)
x = jax.random.normal(key, (4, 16))
params = f.init(key, x, x, x)
out = f.apply(params, x, x, x)

assert out.shape == (4, 16)
assert jnp.isfinite(out).all()
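
Assuming a standard pytest setup (nothing in this diff pins a test runner), both regression tests can be run from the repository root with:

pytest tests/test_attention.py -q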