diff --git a/composer/core/engine.py b/composer/core/engine.py index 17e5365e2a1..7035740012c 100644 --- a/composer/core/engine.py +++ b/composer/core/engine.py @@ -123,8 +123,11 @@ def sigterm_handler(signal, frame): sys.exit(128 + signal) -signal.signal(signal.SIGTERM, sigterm_handler) -signal.signal(signal.SIGINT, sigterm_handler) +try: + signal.signal(signal.SIGTERM, sigterm_handler) + signal.signal(signal.SIGINT, sigterm_handler) +except ValueError: + log.warning('Failed to set signal handler. Checkpoints may not be flushed if the process is killed.') def _get_default_passes(): diff --git a/composer/trainer/mosaic_fsdp.py b/composer/trainer/mosaic_fsdp.py index a7d34d3ce88..986e0eba454 100644 --- a/composer/trainer/mosaic_fsdp.py +++ b/composer/trainer/mosaic_fsdp.py @@ -61,8 +61,8 @@ def patch_pytorch(): from torch.distributed.fsdp import _runtime_utils _runtime_utils._validate_and_get_hybrid_shard_state = lambda *args, **kwargs: None - elif version.parse(torch.__version__) < version.parse('2.3.0'): - # Monkey patch for torch < 2.2.2 ie torch == 2.2.1 + elif version.parse(torch.__version__) < version.parse('2.2.9'): + # Monkey patch for torch < 2.3.0 ie torch == 2.2.1/2.2.2 currently pass elif version.parse(torch.__version__) < version.parse('2.3.1'): diff --git a/composer/utils/checkpoint.py b/composer/utils/checkpoint.py index cffa4f1d7b9..34ef1732fa9 100644 --- a/composer/utils/checkpoint.py +++ b/composer/utils/checkpoint.py @@ -227,16 +227,7 @@ def __init__( metadata_destination = os.path.join(self.destination_path, '.metadata') if dist.get_local_rank() == 0: metadata_path = str(Path(source_path) / Path('.metadata')) - if isinstance(object_store, ObjectStore): - object_store.download_object( - object_name=metadata_path, - filename=metadata_destination, - ) - else: - object_store.download_file( - remote_file_name=metadata_path, - destination=metadata_destination, - ) + download_object_or_file(metadata_path, metadata_destination, object_store) dist.barrier() # FileSystemReader takes in a root directory in its constructor, which is the dir where @@ -385,16 +376,7 @@ def is_checkpoint_legacy_sharded(object_store: Optional[Union[LoggerDestination, _, _, metadata_path = parse_uri(metadata_path) with tempfile.TemporaryDirectory() as temp_dir: metadata_destination = os.path.join(str(temp_dir), '.metadata') - if isinstance(object_store, ObjectStore): - object_store.download_object( - object_name=metadata_path, - filename=metadata_destination, - ) - else: - object_store.download_file( - remote_file_name=metadata_path, - destination=metadata_destination, - ) + download_object_or_file(metadata_path, metadata_destination, object_store) return False except FileNotFoundError: return True diff --git a/setup.py b/setup.py index 675527435da..374a67c48a6 100644 --- a/setup.py +++ b/setup.py @@ -142,6 +142,7 @@ def package_files(prefix: str, directory: str, extension: str): 'cryptography==41.0.5', 'pytest-httpserver>=1.0.4,<1.1', 'setuptools<=59.5.0', + 'pillow==9.3.0', # Matches the Pillow version listed in the Dockerfile ] extra_deps['system_metrics_monitor'] = { diff --git a/tests/test_engine.py b/tests/test_engine.py index dacc13d955b..ac7f456f5b0 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -1,11 +1,13 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +import importlib import logging import os import subprocess import sys import textwrap +import threading from pathlib import Path from typing import List from unittest.mock import Mock @@ -323,3 +325,15 @@ def test_logging( ('composer.core.engine', 10, 'Post-closing callback EventCounterCallback'), ('composer.core.engine', 10, 'Engine closed.'), ] + + +def _worker(): + import composer.core.engine + importlib.reload(composer.core.engine) + + +def test_graceful_fallback_when_signal_handler_cannot_be_set(): + # https://github.com/mosaicml/composer/issues/3151#issue-2205981731 + t = threading.Thread(target=_worker) + t.start() + t.join()