Skip to content

Error during multi-GPU training with DataParallel #396

@Uzheng1224

Description

@Uzheng1224

I encountered an error while attempting multi-GPU training. How should I resolve this issue, and is multi-GPU training supported? Thank you.

Traceback (most recent call last):
File "/root/autodl-tmp/MEAN/train_university.py", line 495, in
train_loss = train(config,
File "/root/autodl-tmp/MEAN/sample4geo/trainer.py", line 48, in train
output1, output2 = model(query, reference)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 185, in forward
outputs = self.parallel_apply(replicas, inputs, module_kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 200, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 110, in parallel_apply
output.reraise()
File "/root/miniconda3/lib/python3.10/site-packages/torch/_utils.py", line 694, in reraise
raise exception
RuntimeError: Caught RuntimeError in replica 1 on device 1.
Original Traceback (most recent call last):
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in _worker
output = module(*input, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/root/autodl-tmp/MEAN/sample4geo/hand_convnext/model.py", line 50, in forward
y1 = self.model_1(x1)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/root/autodl-tmp/MEAN/sample4geo/hand_convnext/ConvNext/SHA_make_model.py", line 371, in forward
part_features = self.vmamba(x) # [B, C, H, W]
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/root/autodl-tmp/MEAN/sample4geo/hand_convnext/ConvNext/backbones/vmamba/backbone.py", line 146, in forward
y = self.m(x)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/root/autodl-tmp/MEAN/sample4geo/hand_convnext/ConvNext/backbones/vmamba/vmamba.py", line 1616, in forward
o, x = layer_forward(layer, x) # (B, H, W, C)
File "/root/autodl-tmp/MEAN/sample4geo/hand_convnext/ConvNext/backbones/vmamba/vmamba.py", line 1609, in layer_forward
x = l.blocks(x)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/container.py", line 215, in forward
input = module(input)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/root/autodl-tmp/MEAN/sample4geo/hand_convnext/ConvNext/backbones/vmamba/vmamba.py", line 1240, in forward
return self._forward(input)
File "/root/autodl-tmp/MEAN/sample4geo/hand_convnext/ConvNext/backbones/vmamba/vmamba.py", line 1228, in _forward
x = x + self.drop_path(self.op(self.norm(x)))
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/root/autodl-tmp/MEAN/sample4geo/hand_convnext/ConvNext/backbones/vmamba/vmamba.py", line 649, in forwardv2
x = self.in_proj(x)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/root/autodl-tmp/MEAN/sample4geo/hand_convnext/ConvNext/backbones/vmamba/vmamba.py", line 45, in forward
return F.conv2d(x, self.weight[:, :, None, None], self.bias)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument weight in method wrapper_CUDA__cudnn_convolution)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions