Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 14 additions & 14 deletions mjx/mujoco/mjx/_src/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,23 +90,20 @@ def _resolve_device(
logging.debug('Picking default device: %s.', device_0)
return device_0

if impl == types.Impl.WARP:
if has_cuda_gpu_device():
cuda_gpus = jax.devices('cuda')
logging.debug('Picking default device: %s', cuda_gpus[0])
device_0 = cuda_gpus[0]
else:
device_0 = jax.devices('cpu')[0]
return device_0

if impl == types.Impl.C or impl == types.Impl.CPP:
cpu_0 = jax.devices('cpu')[0]
logging.debug('Picking default device: %s', cpu_0)
return cpu_0

if impl == types.Impl.WARP:
# WARP implementation requires a CUDA GPU.
cuda_gpus = [d for d in jax.devices('cuda')]
if not cuda_gpus:
raise AssertionError(
'No CUDA GPU devices found in'
f' jax.devices("cuda")={jax.devices("cuda")}.'
)

logging.debug('Picking default device: %s', cuda_gpus[0])
return cuda_gpus[0]

raise ValueError(f'Unsupported implementation: {impl}')


Expand All @@ -121,9 +118,12 @@ def _check_impl_device_compatibility(
impl = types.Impl(impl)

if impl == types.Impl.WARP:
if not _is_cuda_gpu_device(device):
is_cuda_device = _is_cuda_gpu_device(device)
is_cpu_device = device.platform == 'cpu'
if not (is_cuda_device or is_cpu_device):
raise AssertionError(
f'Warp implementation requires a CUDA GPU device, got {device}.'
'Warp implementation requires a CUDA GPU or CPU device, got '
f'{device}.'
)
_check_warp_installed()

Expand Down
37 changes: 5 additions & 32 deletions mjx/mujoco/mjx/_src/io_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,6 @@ def setUp(self):
def test_put_model(self, xml, impl):
if impl == 'warp' and not mjxw.WARP_INSTALLED:
self.skipTest('Warp not installed.')
if impl == 'warp' and not mjx_io.has_cuda_gpu_device():
self.skipTest('No CUDA GPU device available.')

m = mujoco.MjModel.from_xml_string(xml)
mx = mjx.put_model(m, impl=impl)
Expand Down Expand Up @@ -315,8 +313,6 @@ def test_put_model_warp_has_expected_shapes(self):
"""Tests that put_model produces expected shapes for MuJoCo Warp."""
if not mjxw.WARP_INSTALLED:
self.skipTest('Warp not installed.')
if not mjx_io.has_cuda_gpu_device():
self.skipTest('No CUDA GPU device available.')

m = mujoco.MjModel.from_xml_string(_MULTIPLE_CONSTRAINTS)
mx = mjx.put_model(m, impl='warp')
Expand All @@ -339,8 +335,6 @@ def test_put_model_warp_graph_mode(self, mode: str | None):
"""Tests that put_model accepts graph_mode parameter."""
if not mjxw.WARP_INSTALLED:
self.skipTest('Warp not installed.')
if not mjx_io.has_cuda_gpu_device():
self.skipTest('No CUDA GPU device available.')

if mode is None:
graph_mode = None
Expand Down Expand Up @@ -487,8 +481,6 @@ def test_make_data(self, impl: str):
def test_make_data_warp(self):
if not mjxw.WARP_INSTALLED:
self.skipTest('Warp is not installed.')
if not mjx_io.has_cuda_gpu_device():
self.skipTest('No CUDA GPU device.')
m = mujoco.MjModel.from_xml_string(_MULTIPLE_CONVEX_OBJECTS)
d = mjx.make_data(m, impl='warp', nconmax=9, njmax=23)
self.assertEqual(d._impl.contact__dist.shape[0], 9)
Expand All @@ -500,8 +492,6 @@ def test_put_data(self, impl: str):
if impl == 'warp':
if not mjxw.WARP_INSTALLED:
self.skipTest('Warp is not installed.')
if not mjx_io.has_cuda_gpu_device():
self.skipTest('No CUDA GPU device.')

m = mujoco.MjModel.from_xml_string(_MULTIPLE_CONSTRAINTS)
d = mujoco.MjData(m)
Expand Down Expand Up @@ -623,8 +613,6 @@ def test_put_data_warp_ndim(self):
"""Tests that put_data produces expected dimensions for Warp fields."""
if not mjxw.WARP_INSTALLED:
self.skipTest('Warp is not installed.')
if not mjx_io.has_cuda_gpu_device():
self.skipTest('No CUDA GPU device.')

m = mujoco.MjModel.from_xml_string(_MULTIPLE_CONSTRAINTS)
d = mujoco.MjData(m)
Expand Down Expand Up @@ -792,8 +780,6 @@ def test_get_data_into_warp(self):
# and remove this test.
if not mjxw.WARP_INSTALLED:
self.skipTest('Warp is not installed.')
if not mjx_io.has_cuda_gpu_device():
self.skipTest('No CUDA GPU device.')

m = mujoco.MjModel.from_xml_string('<mujoco></mujoco>')
d = mujoco.MjData(m)
Expand Down Expand Up @@ -869,8 +855,6 @@ def test_make_data_warp_has_expected_shapes(self):
"""Tests that make_data produces expected shapes for MuJoCo Warp."""
if not mjxw.WARP_INSTALLED:
self.skipTest('Warp is not installed.')
if not mjx_io.has_cuda_gpu_device():
self.skipTest('No CUDA GPU device.')

m = mujoco.MjModel.from_xml_string(_MULTIPLE_CONSTRAINTS)
dx = mjx.make_data(m, impl='warp')
Expand All @@ -893,8 +877,6 @@ def test_data_slice(self, impl):
"""Tests that slice on Data works as expected."""
if impl == 'warp' and not mjxw.WARP_INSTALLED:
self.skipTest('Warp is not installed.')
if impl == 'warp' and not mjx_io.has_cuda_gpu_device():
self.skipTest('No CUDA GPU device.')

m = mujoco.MjModel.from_xml_string(_MULTIPLE_CONSTRAINTS)
dx = jax.vmap(lambda x: mjx.make_data(m, impl=impl))(jp.arange(10))
Expand Down Expand Up @@ -956,8 +938,8 @@ def put_data(dummy_arg_for_batching):
('gpu-nvidia', 'jax', ('gpu', Impl.JAX)),
('tpu', 'jax', ('tpu', Impl.JAX)),
# WARP backend specified.
('cpu', 'warp', ('cpu', 'error')),
('gpu-notnvidia', 'warp', ('cpu', 'error')),
('cpu', 'warp', ('cpu', Impl.WARP)),
('gpu-notnvidia', 'warp', ('gpu', 'error')),
('gpu-nvidia', 'warp', ('gpu', Impl.WARP)),
('tpu', 'warp', ('tpu', 'error')),
# C backend specified.
Expand All @@ -984,10 +966,10 @@ def put_data(dummy_arg_for_batching):
('gpu-nvidia', 'jax', ('gpu', Impl.JAX)),
('tpu', 'jax', ('tpu', Impl.JAX)),
# WARP backend impl specified.
('cpu', 'warp', ('cpu', 'error')),
('gpu-notnvidia', 'warp', ('cpu', 'error')),
('cpu', 'warp', ('cpu', Impl.WARP)),
('gpu-notnvidia', 'warp', ('cpu', Impl.WARP)),
('gpu-nvidia', 'warp', ('gpu', Impl.WARP)),
('tpu', 'warp', ('tpu', 'error')),
('tpu', 'warp', ('cpu', Impl.WARP)),
# C backend impl specified, CPU should always be available.
('cpu', 'c', ('cpu', Impl.C)),
('gpu-notnvidia', 'c', ('cpu', Impl.C)),
Expand Down Expand Up @@ -1162,15 +1144,6 @@ def backends_side_effect():
self.mock_jax_backends.side_effect = backends_side_effect

expected_device, expected_impl = expected
if (
expected_impl == 'error'
and default_device_str != 'gpu-nvidia'
and impl_str == 'warp'
):
with self.assertRaisesRegex(RuntimeError, 'cuda backend not supported'):
mjx_io._resolve_impl_and_device(impl=impl_str, device=None)
return

if expected_impl == 'error':
with self.assertRaises(AssertionError):
mjx_io._resolve_impl_and_device(impl=impl_str, device=None)
Expand Down
4 changes: 2 additions & 2 deletions mjx/mujoco/mjx/third_party/mujoco_warp/_src/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -2321,7 +2321,7 @@ def create_render_context(

cam_res_arr = wp.array(active_cam_res, dtype=wp.vec2i)

if render_rgb and isinstance(render_rgb, bool):
if isinstance(render_rgb, bool):
render_rgb = [render_rgb] * ncam
elif render_rgb is None:
# TODO: remove after mjwarp depends on mujoco >= 3.4.1 in pyproject.toml
Expand All @@ -2330,7 +2330,7 @@ def create_render_context(
else:
render_rgb = [True] * ncam

if render_depth and isinstance(render_depth, bool):
if isinstance(render_depth, bool):
render_depth = [render_depth] * ncam
elif render_depth is None:
# TODO: remove after mjwarp depends on mujoco >= 3.4.1 in pyproject.toml
Expand Down
89 changes: 55 additions & 34 deletions mjx/mujoco/mjx/third_party/warp/_src/jax_experimental/ffi.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ def __init__(
ffi_ccall_address = ctypes.cast(self.callback_func, ctypes.c_void_p)
ffi_capsule = jax.ffi.pycapsule(ffi_ccall_address.value)
jax.ffi.register_ffi_target(self.name, ffi_capsule, platform="CUDA")
jax.ffi.register_ffi_target(self.name, ffi_capsule, platform="Host")

def __call__(self, *args, output_dims=None, launch_dims=None, vmap_method=None):
num_inputs = len(args)
Expand Down Expand Up @@ -297,8 +298,7 @@ def __call__(self, *args, output_dims=None, launch_dims=None, vmap_method=None):
except Exception:
# ignore unsupported devices like TPUs
pass
# we only support CUDA devices for now
if dev.is_cuda:
if dev.is_cuda or dev.is_cpu:
self.kernel.module.load(dev)

# save launch data to be retrieved by callback
Expand All @@ -320,10 +320,11 @@ def ffi_callback(self, call_frame):
metadata_ext = ctypes.cast(extension, ctypes.POINTER(XLA_FFI_Metadata_Extension))
metadata_ext.contents.metadata.contents.api_version.major_version = 0
metadata_ext.contents.metadata.contents.api_version.minor_version = 1
# Turn on CUDA graphs for this handler.
metadata_ext.contents.metadata.contents.traits = (
XLA_FFI_Handler_TraitsBits.COMMAND_BUFFER_COMPATIBLE
)
# Turn on CUDA graphs for this handler if CUDA is available.
if wp.is_cuda_available():
metadata_ext.contents.metadata.contents.traits = (
XLA_FFI_Handler_TraitsBits.COMMAND_BUFFER_COMPATIBLE
)
return None

# Lock is required to prevent race conditions when callback is invoked
Expand Down Expand Up @@ -377,24 +378,36 @@ def ffi_callback(self, call_frame):
arg_refs.append(arg) # keep a reference

# get device and stream
device = wp.get_cuda_device(get_device_ordinal_from_callframe(call_frame.contents))
stream = get_stream_from_callframe(call_frame.contents)
if wp.is_cuda_available():
device = wp.get_cuda_device(get_device_ordinal_from_callframe(call_frame.contents))
stream = get_stream_from_callframe(call_frame.contents)
else:
device = wp.get_device("cpu")
stream = None

# get kernel hooks
hooks = self.kernel.module.get_kernel_hooks(self.kernel, device)
assert hooks.forward, "Failed to find kernel entry point"

# launch the kernel
wp._src.context.runtime.core.wp_cuda_launch_kernel(
device.context,
hooks.forward,
launch_bounds.size,
0,
256,
hooks.forward_smem_bytes,
kernel_params,
stream,
)
if device.is_cuda:
wp._src.context.runtime.core.wp_cuda_launch_kernel(
device.context,
hooks.forward,
launch_bounds.size,
0,
256,
hooks.forward_smem_bytes,
kernel_params,
stream,
)
else:
wp._src.context.runtime.core.wp_cpu_launch_kernel(
device.context,
hooks.forward,
launch_bounds.size,
kernel_params,
)

except Exception as e:
print(traceback.format_exc())
Expand Down Expand Up @@ -552,6 +565,7 @@ def __init__(
ffi_ccall_address = ctypes.cast(self.callback_func, ctypes.c_void_p)
ffi_capsule = jax.ffi.pycapsule(ffi_ccall_address.value)
jax.ffi.register_ffi_target(self.name, ffi_capsule, platform="CUDA")
jax.ffi.register_ffi_target(self.name, ffi_capsule, platform="Host")

def __call__(self, *args, output_dims=None, vmap_method=None):
num_inputs = len(args)
Expand Down Expand Up @@ -642,8 +656,7 @@ def __call__(self, *args, output_dims=None, vmap_method=None):
except Exception:
# ignore unsupported devices like TPUs
pass
# we only support CUDA devices for now
if dev.is_cuda:
if dev.is_cuda or dev.is_cpu:
module.load(dev)

# save call data to be retrieved by callback
Expand All @@ -664,8 +677,8 @@ def ffi_callback(self, call_frame):
metadata_ext = ctypes.cast(extension, ctypes.POINTER(XLA_FFI_Metadata_Extension))
metadata_ext.contents.metadata.contents.api_version.major_version = 0
metadata_ext.contents.metadata.contents.api_version.minor_version = 1
# Turn on CUDA graphs for this handler.
if self.graph_mode is GraphMode.JAX:
# Turn on CUDA graphs for this handler if CUDA is available.
if self.graph_mode is GraphMode.JAX and wp.is_cuda_available():
metadata_ext.contents.metadata.contents.traits = (
XLA_FFI_Handler_TraitsBits.COMMAND_BUFFER_COMPATIBLE
)
Expand All @@ -692,8 +705,12 @@ def ffi_callback(self, call_frame):
assert num_inputs == self.num_inputs
assert num_outputs == self.num_outputs

cuda_stream = get_stream_from_callframe(call_frame.contents)
device_ordinal = get_device_ordinal_from_callframe(call_frame.contents)
if wp.is_cuda_available():
cuda_stream = get_stream_from_callframe(call_frame.contents)
device_ordinal = get_device_ordinal_from_callframe(call_frame.contents)
else:
cuda_stream = None
device_ordinal = 0

if self.graph_mode == GraphMode.WARP:
# check if we already captured an identical call
Expand Down Expand Up @@ -797,9 +814,12 @@ def ffi_callback(self, call_frame):
# early out
return

device_ordinal = get_device_ordinal_from_callframe(call_frame.contents)
device = wp.get_cuda_device(device_ordinal)
stream = wp.Stream(device, cuda_stream=cuda_stream)
if wp.is_cuda_available():
device = wp.get_cuda_device(device_ordinal)
stream = wp.Stream(device, cuda_stream=cuda_stream)
else:
device = wp.get_device("cpu")
stream = None

# reconstruct the argument list
arg_list = []
Expand All @@ -824,16 +844,16 @@ def ffi_callback(self, call_frame):
arg_list.append(arr)

# call the Python function with reconstructed arguments
with wp.ScopedStream(stream, sync_enter=False):
if stream.is_capturing:
with wp.ScopedStream(stream, sync_enter=False) if stream else wp.ScopedDevice(device):
if stream and stream.is_capturing:
# capturing with JAX
with wp.ScopedCapture(external=True) as capture:
self.func(*arg_list)

# keep a reference to the capture object to prevent required modules getting unloaded
call_desc.capture = capture

elif self.graph_mode == GraphMode.WARP:
elif self.graph_mode == GraphMode.WARP and device.is_cuda:
# capturing with WARP
with wp.ScopedCapture() as capture:
self.func(*arg_list)
Expand All @@ -846,7 +866,7 @@ def ffi_callback(self, call_frame):
if self._graph_cache_max is not None and len(self.captures) > self._graph_cache_max:
self.captures.popitem(last=False)

elif self.graph_mode == GraphMode.WARP_STAGED_EX:
elif self.graph_mode == GraphMode.WARP_STAGED_EX and device.is_cuda:
# capturing with WARP using staging buffers and memcopies done outside of the graph
wp_memcpy_batch = wp._src.context.runtime.core.wp_memcpy_batch

Expand Down Expand Up @@ -889,7 +909,7 @@ def ffi_callback(self, call_frame):
# TODO: we should have a way of freeing this
call_desc.capture = capture

elif self.graph_mode == GraphMode.WARP_STAGED:
elif self.graph_mode == GraphMode.WARP_STAGED and device.is_cuda:
# capturing with WARP using staging buffers and memcopies done inside of the graph
wp_cuda_graph_insert_memcpy_batch = (
wp._src.context.runtime.core.wp_cuda_graph_insert_memcpy_batch
Expand Down Expand Up @@ -967,7 +987,7 @@ def ffi_callback(self, call_frame):
call_desc.capture = capture

else:
# not capturing
# not capturing or on CPU
self.func(*arg_list)

except Exception as e:
Expand Down Expand Up @@ -1537,7 +1557,7 @@ def ffi_callback(call_frame):
metadata_ext = ctypes.cast(extension, ctypes.POINTER(XLA_FFI_Metadata_Extension))
metadata_ext.contents.metadata.contents.api_version.major_version = 0
metadata_ext.contents.metadata.contents.api_version.minor_version = 1
if graph_compatible:
if graph_compatible and wp.is_cuda_available():
# Turn on CUDA graphs for this handler.
metadata_ext.contents.metadata.contents.traits = (
XLA_FFI_Handler_TraitsBits.COMMAND_BUFFER_COMPATIBLE
Expand Down Expand Up @@ -1576,6 +1596,7 @@ def ffi_callback(call_frame):
ffi_ccall_address = ctypes.cast(callback_func, ctypes.c_void_p)
ffi_capsule = jax.ffi.pycapsule(ffi_ccall_address.value)
jax.ffi.register_ffi_target(name, ffi_capsule, platform="CUDA")
jax.ffi.register_ffi_target(name, ffi_capsule, platform="Host")


###############################################################################
Expand Down
Loading