Skip to content

Commit

Permalink
do not kill everything when a single task fails
Browse files (browse the repository at this point in the history)
  • Loading branch information
guipenedo committed Jul 12, 2024
1 parent 0814338 commit 84dd126
Showing 1 changed file with 1 addition and 1 deletion.
2 changes: 1 addition & 1 deletion src/datatrove/executor/slurm_nodes.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -262,7 +262,7 @@ def launch_job(self):
srun_args_str = " ".join([f"--{k}={v}" for k, v in self.srun_args.items()]) if self.srun_args else ""
launch_file_contents = self.get_launch_file_contents(
self.get_sbatch_args(max_array),
- f"srun {srun_args_str} --environment=datatrove -l launch_pickled_pipeline {self.logging_dir.resolve_paths('executor.pik')}",
+ f"srun {srun_args_str} --kill-on-bad-exit=0 --environment=datatrove -l launch_pickled_pipeline {self.logging_dir.resolve_paths('executor.pik')}",
)
# save it
with self.logging_dir.open("launch_script.slurm", "w") as launchscript_f:
Expand Down

0 comments on commit 84dd126

Please sign in to comment.