Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RuntimeError: DataLoader worker (pid(s) 10156, 4764, 12160, 20160) exited unexpectedly #569

Open
hizwj opened this issue Jun 29, 2024 · 0 comments

Comments

@hizwj
Copy link

hizwj commented Jun 29, 2024

At the beginning of training, it was normal:

[NanoDet][06-29 15:22:19]INFO:Train|Epoch1/30|Iter0(1/8)| mem:2.61G| lr:1.00e-07| loss_qfl:0.5918| loss_bbox:1.1593| loss_dfl:0.5198| aux_loss_qfl:0.5985| aux_loss_bbox:1.1496| aux_loss_dfl:0.5269|
INFO:NanoDet:Train|Epoch1/30|Iter0(1/8)| mem:2.61G| lr:1.00e-07| loss_qfl:0.5918| loss_bbox:1.1593| loss_dfl:0.5198| aux_loss

But every time training reached the 10th epoch, it reported an error:
Traceback (most recent call last):
File "", line 1, in
File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 125, in _main
prepare(preparation_data)
File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 236, in prepare
_fixup_main_from_path(data['init_main_from_path'])
File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 287, in _fixup_main_from_path
main_content = runpy.run_path(main_path,
File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 265, in run_path
return _run_module_code(code, init_globals, run_name,
File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 97, in run_module_code
run_code(code, mod_globals, init_globals,
File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 87, in run_code
exec(code, run_globals)
File "E:\py_project\nanodet-main\tools\train.py", line 19, in
import pytorch_lightning as pl
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\__init__.py", line 34, in <module>
from lightning_fabric.utilities.seed import seed_everything # noqa: E402
File "D:\Anaconda\envs\nanodet\lib\site-packages\lightning_fabric\__init__.py", line 23, in <module>
from lightning_fabric.fabric import Fabric # noqa: E402
File "D:\Anaconda\envs\nanodet\lib\site-packages\lightning_fabric\fabric.py", line 21, in
import torch
File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\__init__.py", line 128, in <module>
raise err
OSError: [WinError 1455] 页面文件太小,无法完成操作。 Error loading "D:\Anaconda\envs\nanodet\lib\site-packages\torch\lib\cufft64_10.dll" or one of its dependencies.
Traceback (most recent call last):
File "", line 1, in
File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 125, in _main
prepare(preparation_data)
File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 236, in prepare
_fixup_main_from_path(data['init_main_from_path'])
File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 287, in _fixup_main_from_path
main_content = runpy.run_path(main_path,
File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 265, in run_path
return _run_module_code(code, init_globals, run_name,
File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 97, in run_module_code
run_code(code, mod_globals, init_globals,
File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 87, in run_code
exec(code, run_globals)
File "E:\py_project\nanodet-main\tools\train.py", line 19, in
import pytorch_lightning as pl
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\__init__.py", line 34, in <module>
from lightning_fabric.utilities.seed import seed_everything # noqa: E402
File "D:\Anaconda\envs\nanodet\lib\site-packages\lightning_fabric\__init__.py", line 23, in <module>
from lightning_fabric.fabric import Fabric # noqa: E402
File "D:\Anaconda\envs\nanodet\lib\site-packages\lightning_fabric\fabric.py", line 21, in
import torch
File "D:\Anaconda\envs\nanodet\lib\site-packages\torch_init.py", line 676, in
from .storage import StorageBase, TypedStorage, LegacyStorage, UntypedStorage
File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\storage.py", line 11, in
import numpy as np
File "D:\Anaconda\envs\nanodet\lib\site-packages\numpy\__init__.py", line 154, in <module>
from . import ma
File "D:\Anaconda\envs\nanodet\lib\site-packages\numpy\ma\__init__.py", line 42, in <module>
from . import core
File "", line 991, in _find_and_load
Traceback (most recent call last):
File "", line 1, in
File "", line 975, in _find_and_load_unlocked
File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 116, in spawn_main
File "", line 671, in _load_unlocked
exitcode = _main(fd, parent_sentinel)
File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 125, in _main
prepare(preparation_data)
File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 236, in prepare
File "", line 839, in exec_module
_fixup_main_from_path(data['init_main_from_path'])
File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 287, in _fixup_main_from_path
File "", line 934, in get_code
main_content = runpy.run_path(main_path,
File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 265, in run_path
return _run_module_code(code, init_globals, run_name,
File "", line 1033, in get_data
File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 97, in run_module_code
run_code(code, mod_globals, init_globals,
File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 87, in run_code
MemoryError
exec(code, run_globals)
File "E:\py_project\nanodet-main\tools\train.py", line 19, in
import pytorch_lightning as pl
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\__init__.py", line 34, in <module>
from lightning_fabric.utilities.seed import seed_everything # noqa: E402
File "D:\Anaconda\envs\nanodet\lib\site-packages\lightning_fabric\__init__.py", line 23, in <module>
from lightning_fabric.fabric import Fabric # noqa: E402
File "D:\Anaconda\envs\nanodet\lib\site-packages\lightning_fabric\fabric.py", line 21, in
import torch
File "D:\Anaconda\envs\nanodet\lib\site-packages\torch_init.py", line 218, in
from torch._C import * # noqa: F403
RuntimeError: MemoryError: Out of memory interning an attribute name
Traceback (most recent call last):
File "", line 1, in
File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 125, in _main
prepare(preparation_data)
File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 236, in prepare
_fixup_main_from_path(data['init_main_from_path'])
File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 287, in fixup_main_from_path
main_content = runpy.run_path(main_path,
File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 265, in run_path
return run_module_code(code, init_globals, run_name,
File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 97, in run_module_code
run_code(code, mod_globals, init_globals,
File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 87, in run_code
exec(code, run_globals)
File "E:\py_project\nanodet-main\tools\train.py", line 19, in
import pytorch_lightning as pl
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\__init__.py", line 34, in <module>
from lightning_fabric.utilities.seed import seed_everything # noqa: E402
File "D:\Anaconda\envs\nanodet\lib\site-packages\lightning_fabric\__init__.py", line 23, in <module>
from lightning_fabric.fabric import Fabric # noqa: E402
File "D:\Anaconda\envs\nanodet\lib\site-packages\lightning_fabric\fabric.py", line 21, in
import torch
File "D:\Anaconda\envs\nanodet\lib\site-packages\torch_init.py", line 831, in
from .functional import * # noqa: F403
File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\functional.py", line 8, in
import torch.nn.functional as F
File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\nn_init.py", line 1, in
from .modules import * # noqa: F403
File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\nn\modules_init.py", line 18, in
from .batchnorm import BatchNorm1d, BatchNorm2d, BatchNorm3d, SyncBatchNorm,
File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\nn\modules\batchnorm.py", line 9, in
from .functions import SyncBatchNorm as sync_batch_norm
File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\nn\modules_functions.py", line 4, in
from torch.autograd.function import Function
File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\autograd\__init__.py", line 21, in <module>
from . import functional
File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\autograd\functional.py", line 3, in
from . import forward_ad as fwAD
File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\autograd\forward_ad.py", line 106, in
_UnpackedDualTensor = namedtuple('UnpackedDualTensor', ['primal', 'tangent'])
File "D:\Anaconda\envs\nanodet\lib\collections\__init__.py", line 394, in namedtuple
exec(s, namespace)
MemoryError
[NanoDet][06-29 15:28:13]INFO:Val|Epoch10/30|Iter80(1/2)| mem:2.79G| lr:1.58e-04| loss_qfl:0.9156| loss_bbox:0.9682| loss_dfl:0.4420| aux_loss_qfl:0.4186| aux_loss_bbox:0.7197| aux_loss_dfl:0.3550|
INFO:NanoDet:Val|Epoch10/30|Iter80(1/2)| mem:2.79G| lr:1.58e-04| loss_qfl:0.9156| loss_bbox:0.9682| loss_dfl:0.4420| aux_loss_qfl:0.4186| aux_loss_bbox:0.7197| aux_loss_dfl:0.3550|

Traceback (most recent call last):
File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\utils\data\dataloader.py", line 1120, in _try_get_data
data = self._data_queue.get(timeout=timeout)
File "D:\Anaconda\envs\nanodet\lib\queue.py", line 178, in get
raise Empty
_queue.Empty

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
File "tools/train.py", line 156, in
main(args)
File "tools/train.py", line 151, in main
trainer.fit(task, train_dataloader, val_dataloader, ckpt_path=model_resume_path)
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 608, in fit
call._call_and_handle_interrupt(
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\trainer\call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 650, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 1112, in _run
results = self._run_stage()
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 1191, in _run_stage
self._run_train()
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 1214, in _run_train
self.fit_loop.run()
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\loops\loop.py", line 199, in run
self.advance(*args, **kwargs)
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\loops\fit_loop.py", line 267, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\loops\loop.py", line 200, in run
self.on_advance_end()
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\loops\epoch\training_epoch_loop.py", line 250, in on_advance_end
self._run_validation()
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\loops\epoch\training_epoch_loop.py", line 308, in _run_validation
self.val_loop.run()
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\loops\loop.py", line 199, in run
self.advance(*args, **kwargs)
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\loops\dataloader\evaluation_loop.py", line 152, in advance
dl_outputs = self.epoch_loop.run(self._data_fetcher, dl_max_batches, kwargs)
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\loops\loop.py", line 199, in run
self.advance(*args, **kwargs)
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\loops\epoch\evaluation_epoch_loop.py", line 121, in advance
batch = next(data_fetcher)
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\utilities\fetching.py", line 184, in next
return self.fetching_function()
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\utilities\fetching.py", line 265, in fetching_function
self._fetch_next_batch(self.dataloader_iter)
File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\utilities\fetching.py", line 280, in _fetch_next_batch
batch = next(iterator)
File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\utils\data\dataloader.py", line 628, in next
data = self._next_data()
File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\utils\data\dataloader.py", line 1316, in _next_data
idx, data = self._get_data()
File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\utils\data\dataloader.py", line 1272, in _get_data
success, data = self._try_get_data()
File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\utils\data\dataloader.py", line 1133, in _try_get_data
raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str)) from e
RuntimeError: DataLoader worker (pid(s) 10156, 4764, 12160, 20160) exited unexpectedly

What is wrong with it? What should I do to solve this problem?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant