You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Hello,
Thank you for your amazing work. I ran into an issue while trying to train the model on 4 GPUs; the traceback is shown below. However, I was able to train the model with one GPU.
Could you please help me out with this?
Traceback (most recent call last):
File "/home/ec2-user/SageMaker/segment-anything/TRACER/main.py", line 55, in <module>
main(args)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/main.py", line 35, in main
Trainer(args, save_path)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/trainer.py", line 56, in __init__
train_loss, train_mae = self.training(args)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/trainer.py", line 105, in training
outputs, edge_mask, ds_map = self.model(images)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 185, in forward
outputs = self.parallel_apply(replicas, inputs, module_kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 200, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 110, in parallel_apply
output.reraise()
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/_utils.py", line 694, in reraise
raise exception
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in _worker
output = module(*input, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/model/TRACER.py", line 38, in forward
features, edge = self.model.get_blocks(x, H, W)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/model/EfficientNet.py", line 245, in get_blocks
x = block(x, drop_connect_rate=drop_connect_rate)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/model/EfficientNet.py", line 122, in forward
x_squeezed = self._se_reduce(x_squeezed)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/util/effi_utils.py", line 301, in forward
x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
RuntimeError: GET was unable to find an engine to execute this computation
The text was updated successfully, but these errors were encountered:
Hello,
Thank you for your amazing work. I ran into an issue while trying to train the model on 4 GPUs; the traceback is shown below. However, I was able to train the model with one GPU.
Could you please help me out with this?
Traceback (most recent call last):
File "/home/ec2-user/SageMaker/segment-anything/TRACER/main.py", line 55, in <module>
main(args)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/main.py", line 35, in main
Trainer(args, save_path)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/trainer.py", line 56, in __init__
train_loss, train_mae = self.training(args)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/trainer.py", line 105, in training
outputs, edge_mask, ds_map = self.model(images)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 185, in forward
outputs = self.parallel_apply(replicas, inputs, module_kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 200, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 110, in parallel_apply
output.reraise()
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/_utils.py", line 694, in reraise
raise exception
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in _worker
output = module(*input, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/model/TRACER.py", line 38, in forward
features, edge = self.model.get_blocks(x, H, W)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/model/EfficientNet.py", line 245, in get_blocks
x = block(x, drop_connect_rate=drop_connect_rate)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/model/EfficientNet.py", line 122, in forward
x_squeezed = self._se_reduce(x_squeezed)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/util/effi_utils.py", line 301, in forward
x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
RuntimeError: GET was unable to find an engine to execute this computation
The text was updated successfully, but these errors were encountered: