Help with multi-GPU training #5314

Ayadx · 2024-06-21T21:10:34Z

Hi, am trying to use multi-GPU training using kaggle with two Tesla T4.
my code only runs on 1 GPU, the other are not utilized.
I am able to train with custom dataset and getting acceptable results, but wish to use 2 GPUs for faster training.

i am using this but is not working: "python -m torch.distributed.launch --nproc_per_node=2 train_yolo.py"

Full runnable code or full changes you made:

`
import os
import json
import multiprocessing as mp
from detectron2.engine import DefaultTrainer, HookBase
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader, DatasetCatalog, MetadataCatalog
from detectron2.structures import BoxMode

Define the training script content

script_content = """
import os
import json
import multiprocessing as mp
from detectron2.engine import DefaultTrainer, HookBase
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader, DatasetCatalog, MetadataCatalog
from detectron2.structures import BoxMode

Unregister the datasets if they are already registered

for d in ["pv_anomaly_train", "pv_anomaly_val", "pv_anomaly_test"]:
if d in DatasetCatalog.list():
DatasetCatalog.remove(d)
if d in MetadataCatalog.list():
MetadataCatalog.remove(d)

def load_coco_json(json_file, image_root, dataset_name):
with open(json_file) as f:
imgs_anns = json.load(f)

dataset_dicts = []
for img_ann in imgs_anns["images"]:
    record = {}
    record["file_name"] = os.path.join(image_root, img_ann["file_name"])
    record["image_id"] = img_ann["id"]
    record["height"] = img_ann["height"]
    record["width"] = img_ann["width"]

    objs = []
    for ann in imgs_anns["annotations"]:
        if ann["image_id"] != img_ann["id"]:
            continue
        obj = {
            "bbox": ann["bbox"],
            "bbox_mode": BoxMode.XYWH_ABS,
            "category_id": ann["category_id"] - 1,  # Subtract 1 to make the category_id 0-based
            "iscrowd": ann["iscrowd"]
        }
        objs.append(obj)
    record["annotations"] = objs
    dataset_dicts.append(record)
return dataset_dicts

def register_datasets():
DatasetCatalog.register(
"pv_anomaly_train",
lambda: load_coco_json(
"/kaggle/working/0PVProjects/Univpm_DataSet/labels/train_annotations.json",
"/kaggle/working/0PVProjects/Univpm_DataSet/images/train_combined_data",
"pv_anomaly_train"
)
)
MetadataCatalog.get("pv_anomaly_train").set(thing_classes=["anomaly"])

DatasetCatalog.register(
    "pv_anomaly_val",
    lambda: load_coco_json(
        "/kaggle/working/0PVProjects/Univpm_DataSet/labels/val_annotations.json",
        "/kaggle/working/0PVProjects/Univpm_DataSet/images/val",
        "pv_anomaly_val"
    )
)
MetadataCatalog.get("pv_anomaly_val").set(thing_classes=["anomaly"])

DatasetCatalog.register(
    "pv_anomaly_test",
    lambda: load_coco_json(
        "/kaggle/working/0PVProjects/Univpm_DataSet/labels/test_annotations.json",
        "/kaggle/working/0PVProjects/Univpm_DataSet/images/test",
        "pv_anomaly_test"
    )
)
MetadataCatalog.get("pv_anomaly_test").set(thing_classes=["anomaly"])

# Retrieve metadata to ensure it is set correctly
pv_anomaly_metadata = MetadataCatalog.get("pv_anomaly_train")
pv_anomaly_metadata1 = MetadataCatalog.get("pv_anomaly_val")
print(pv_anomaly_metadata)
print(pv_anomaly_metadata1)

def set_multiprocessing_start_method():
try:
mp.set_start_method('spawn', force=True)
except RuntimeError as e:
if "context has already been set" in str(e):
print("Multiprocessing context already set, continuing without changing start method.")
else:
raise

class PrintMetricsHook(HookBase):
def init(self, cfg):
self.cfg = cfg

def after_step(self):
    # Every iteration
    iter_num = self.trainer.iter
    # Print the metrics for every iteration
    metrics = self.trainer.storage.latest()

    # Format and print the metrics
    print(f"\\nIteration {iter_num} Metrics:")
    print(f"{'-'*40}")

    for key, (value, _) in metrics.items():
        print(f"{key}: {value:.4f}")

    print(f"{'-'*40}")

class MyTrainer(DefaultTrainer):
@classmethod
def build_evaluator(cls, cfg, dataset_name):
return COCOEvaluator(dataset_name, cfg, False, output_dir=cfg.OUTPUT_DIR)

def main():
register_datasets()
set_multiprocessing_start_method()

# Get default config
cfg = get_cfg()

# Load Faster R-CNN with "ResNeXt-101-32x8d model trained with Caffe2 at FB" backbone pre-trained on COCO
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml"))

# Set training and validation datasets
cfg.DATASETS.TRAIN = ("pv_anomaly_train",)  # Training dataset
cfg.DATASETS.TEST = ("pv_anomaly_val",)     # Validation dataset

# Number of data loading workers
cfg.DATALOADER.NUM_WORKERS = 2

# Set weights for pre-trained model
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml")

# Number of images per batch
cfg.SOLVER.IMS_PER_BATCH = 2  # Reduced batch size to fit into memory

# Base learning rate
cfg.SOLVER.BASE_LR = 0.00025

# Maximum number of iterations in detectron2, epoch is MAX_ITER * BATCH_SIZE / TOTAL_NUM_IMAGES
cfg.SOLVER.MAX_ITER = 3000

# ROI Heads batch size per image
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128

# Number of classes (in this case, only 1: 'anomaly')
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1


# Create output directory if it doesn't exist
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

# Set the maximum split size to avoid fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

# Create a MyTrainer instance
trainer = MyTrainer(cfg)

# Add the custom hook to print metrics after each iteration
trainer.register_hooks([PrintMetricsHook(cfg)])

# Resume training if checkpoint exists, otherwise start from scratch
trainer.resume_or_load(resume=False)

# Start training
trainer.train()

# Run evaluation
evaluator = COCOEvaluator("pv_anomaly_val", cfg, False, output_dir=cfg.OUTPUT_DIR)
val_loader = build_detection_test_loader(cfg, "pv_anomaly_val")
inference_on_dataset(trainer.model, val_loader, evaluator)

if name == "main":
main()
"""

Write the script to a file

script_path = '/kaggle/working/train_yolo.py'
with open(script_path, 'w') as f:
f.write(script_content)

Define the command to run the training script using torch.distributed.run

train_command = f"""
python -m torch.distributed.run --nproc_per_node=2 {script_path}
"""

Execute the training command

os.system(train_command)

`
best regards!

The text was updated successfully, but these errors were encountered:

github-actions · 2024-06-21T21:10:45Z

You've chosen to report an unexpected problem or bug. Unless you already know the root cause of it, please include details about it by filling the issue template.
The following information is missing: "Instructions To Reproduce the Issue and Full Logs"; "Your Environment";

Programmer-RD-AI · 2024-06-22T07:24:54Z

Hi,
Check the Issues #2442 & #2473
Example on How to implement multi GPU training
Hope this helps, If there are further questions please feel free to comment :)
Best regards,
Ranuga

github-actions bot added needs-more-info More info is needed to complete the issue and removed needs-more-info More info is needed to complete the issue labels Jun 21, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Help with multi-GPU training #5314

Help with multi-GPU training #5314

Ayadx commented Jun 21, 2024 •

edited

Loading

github-actions bot commented Jun 21, 2024

Programmer-RD-AI commented Jun 22, 2024

Help with multi-GPU training #5314

Help with multi-GPU training #5314

Comments

Ayadx commented Jun 21, 2024 • edited Loading

Define the training script content

Unregister the datasets if they are already registered

Write the script to a file

Define the command to run the training script using torch.distributed.run

Execute the training command

github-actions bot commented Jun 21, 2024

Programmer-RD-AI commented Jun 22, 2024

Ayadx commented Jun 21, 2024 •

edited

Loading