diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 4d4dfbe6fe..5f323b3a89 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -2353,12 +2353,16 @@ int cr_restore_tasks(void)
 	if (init_service_fd())
 		return 1;
 
-	if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE))
-		return -1;
-
 	if (check_img_inventory(/* restore = */ true) < 0)
 		goto err;
 
+	/*
+	 * Plugins are initialized after the inventory image is read because
+	 * cr_plugin_init() uses the inventory to decide which plugins to load.
+	 */
+	if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE))
+		return -1;
+
 	if (init_stats(RESTORE_STATS))
 		goto err;
 
diff --git a/criu/image.c b/criu/image.c
index 9fb390ab7e..cbcd56e9b4 100644
--- a/criu/image.c
+++ b/criu/image.c
@@ -26,6 +26,44 @@ TaskKobjIdsEntry *root_ids;
 u32 root_cg_set;
 Lsmtype image_lsm;
 
+/*
+ * Plugin usage recorded in the inventory image. A has_<name> flag is set
+ * when the plugin was initialized; the <name> flag is set when the plugin
+ * reports that the checkpoint needs it at restore time.
+ */
+static PluginsEntry criu_plugins = PLUGINS_ENTRY__INIT;
+bool enabled_plugins_inventory;
+
+void __attribute__((used)) set_cuda_plugin(void)
+{
+	criu_plugins.has_cuda = true;
+}
+
+void __attribute__((used)) set_cuda_plugin_enabled(void)
+{
+	criu_plugins.cuda = true;
+}
+
+bool __attribute__((used)) get_cuda_plugin_enabled(void)
+{
+	return criu_plugins.has_cuda && criu_plugins.cuda;
+}
+
+void __attribute__((used)) set_amdgpu_plugin(void)
+{
+	criu_plugins.has_amdgpu = true;
+}
+
+void __attribute__((used)) set_amdgpu_plugin_enabled(void)
+{
+	criu_plugins.amdgpu = true;
+}
+
+bool __attribute__((used)) get_amdgpu_plugin_enabled(void)
+{
+	return criu_plugins.has_amdgpu && criu_plugins.amdgpu;
+}
+
 int check_img_inventory(bool restore)
 {
 	int ret = -1;
@@ -99,6 +137,16 @@ int check_img_inventory(bool restore)
 		} else {
 			opts.network_lock_method = he->network_lock_method;
 		}
+
+		if (he->plugins == NULL) {
+			enabled_plugins_inventory = false;
+		} else {
+			criu_plugins.has_amdgpu = he->plugins->has_amdgpu;
+			criu_plugins.amdgpu = he->plugins->amdgpu;
+			criu_plugins.has_cuda = he->plugins->has_cuda;
+			criu_plugins.cuda = he->plugins->cuda;
+			enabled_plugins_inventory = true;
+		}
 	}
 
 	ret = 0;
@@ -121,6 +169,7 @@ int write_img_inventory(InventoryEntry *he)
 	if (!img)
 		return -1;
 
+	he->plugins = &criu_plugins;
 	ret = pb_write_one(img, he, PB_INVENTORY);
 
 	xfree(he->root_ids);
diff --git a/criu/include/image.h b/criu/include/image.h
index a17aae35c2..949bc3eced 100644
--- a/criu/include/image.h
+++ b/criu/include/image.h
@@ -177,4 +177,16 @@ extern int read_img_str(struct cr_img *, char **pstr, int size);
 
 extern void close_image(struct cr_img *);
 
+/* For backwards compatibility, if the inventory image does not contain
+ * a "plugins" field, we should load all plugins. */
+extern bool enabled_plugins_inventory;
+
+extern void set_cuda_plugin(void);
+extern void set_cuda_plugin_enabled(void);
+extern bool get_cuda_plugin_enabled(void);
+
+extern void set_amdgpu_plugin(void);
+extern void set_amdgpu_plugin_enabled(void);
+extern bool get_amdgpu_plugin_enabled(void);
+
 #endif /* __CR_IMAGE_H__ */
diff --git a/criu/plugin.c b/criu/plugin.c
index 58b5ea5bfe..47d6284085 100644
--- a/criu/plugin.c
+++ b/criu/plugin.c
@@ -205,6 +205,10 @@ int cr_plugin_init(int stage)
 	char *path;
 	size_t i;
 	DIR *d;
+	bool cuda_plugin_required = get_cuda_plugin_enabled();
+	bool cuda_plugin_loaded = false;
+	bool amdgpu_plugin_required = get_amdgpu_plugin_enabled();
+	bool amdgpu_plugin_loaded = false;
 
 	INIT_LIST_HEAD(&cr_plugin_ctl.head);
 	for (i = 0; i < ARRAY_SIZE(cr_plugin_ctl.hook_chain); i++)
@@ -247,6 +251,22 @@ int cr_plugin_init(int stage)
 		if (len < 3 || strncmp(de->d_name + len - 3, ".so", 3))
 			continue;
 
+		/*
+		 * On restore with a plugins inventory, skip GPU plugins the
+		 * image does not require. Note strcmp() returns 0 on a match.
+		 */
+		if (enabled_plugins_inventory && stage == CR_PLUGIN_STAGE__RESTORE) {
+			if (strcmp(de->d_name, "amdgpu_plugin.so") == 0) {
+				if (!amdgpu_plugin_required)
+					continue; /* Skip unnecessary plugin */
+				amdgpu_plugin_loaded = true;
+			} else if (strcmp(de->d_name, "cuda_plugin.so") == 0) {
+				if (!cuda_plugin_required)
+					continue; /* Skip unnecessary plugin */
+				cuda_plugin_loaded = true;
+			}
+		}
+
 		if (snprintf(path, sizeof(path), "%s/%s", opts.libdir, de->d_name) >= sizeof(path)) {
 			pr_err("Unable to build plugin path\n");
 			goto err;
@@ -256,6 +276,18 @@ int cr_plugin_init(int stage)
 			goto err;
 	}
 
+	if (enabled_plugins_inventory && stage == CR_PLUGIN_STAGE__RESTORE) {
+		if (amdgpu_plugin_required && !amdgpu_plugin_loaded) {
+			pr_err("AMD GPU plugin is required for restore\n");
+			goto err;
+		}
+
+		if (cuda_plugin_required && !cuda_plugin_loaded) {
+			pr_err("CUDA plugin is required for restore\n");
+			goto err;
+		}
+	}
+
 	exit_code = 0;
 err:
 	closedir(d);
diff --git a/images/inventory.proto b/images/inventory.proto
index a735bad1d0..71d213e797 100644
--- a/images/inventory.proto
+++ b/images/inventory.proto
@@ -10,6 +10,11 @@ enum lsmtype {
 	APPARMOR = 2;
 }
 
+message plugins_entry {
+	optional bool amdgpu = 1;
+	optional bool cuda = 2;
+}
+
 message inventory_entry {
 	required uint32 img_version = 1;
 	optional bool fdinfo_per_id = 2;
@@ -21,4 +26,5 @@ message inventory_entry {
 	optional uint32 pre_dump_mode = 9;
 	optional bool tcp_close = 10;
 	optional uint32 network_lock_method = 11;
+	optional plugins_entry plugins = 12;
 }
diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c
index b56ba6d140..7e2e552e29 100644
--- a/plugins/amdgpu/amdgpu_plugin.c
+++ b/plugins/amdgpu/amdgpu_plugin.c
@@ -360,6 +360,8 @@ int amdgpu_plugin_init(int stage)
 	kfd_max_buffer_size = 0;
 	getenv_size_t("KFD_MAX_BUFFER_SIZE", &kfd_max_buffer_size);
 
+	set_amdgpu_plugin();
+
 	return 0;
 }
 
@@ -414,6 +416,11 @@ int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf)
 	if (ret)
 		pr_perror("%s(), Can't handle VMAs of input device", __func__);
 
+	/* Set the AMD GPU plugin as enabled in the inventory image
+	 * only if the checkpoint contains GPU state (i.e., ret == 0). */
+	if (ret == 0)
+		set_amdgpu_plugin_enabled();
+
 	return ret;
 }
 CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma)
diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c
index 23c3f4b1ab..b967506775 100644
--- a/plugins/cuda/cuda_plugin.c
+++ b/plugins/cuda/cuda_plugin.c
@@ -334,6 +334,10 @@ int cuda_plugin_checkpoint_devices(int pid)
 		return 0;
 	}
 
+	/* Set the CUDA plugin as enabled in the inventory image
+	 * only if the checkpoint contains GPU state (i.e., restore_tid != -1). */
+	set_cuda_plugin_enabled();
+
 	pr_info("Checkpointing CUDA devices on pid %d restore_tid %d\n", pid, restore_tid);
 	/* We need to resume the checkpoint thread to prepare the mappings for
 	 * checkpointing
@@ -463,6 +467,8 @@ int cuda_plugin_init(int stage)
 {
 	int ret;
 
+	set_cuda_plugin();
+
 	if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) {
 		pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n");
 		plugin_disabled = true;