Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Help with multi-GPU training #5314

Open
Ayadx opened this issue Jun 21, 2024 · 2 comments
Open

Help with multi-GPU training #5314

Ayadx opened this issue Jun 21, 2024 · 2 comments

Comments

@Ayadx
Copy link

Ayadx commented Jun 21, 2024

Hi, am trying to use multi-GPU training using kaggle with two Tesla T4.
my code only runs on 1 GPU, the other are not utilized.
I am able to train with custom dataset and getting acceptable results, but wish to use 2 GPUs for faster training.

i am using this but is not working: "python -m torch.distributed.launch --nproc_per_node=2 train_yolo.py"

Full runnable code or full changes you made:

`
import os
import json
import multiprocessing as mp
from detectron2.engine import DefaultTrainer, HookBase
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader, DatasetCatalog, MetadataCatalog
from detectron2.structures import BoxMode

Define the training script content

script_content = """
import os
import json
import multiprocessing as mp
from detectron2.engine import DefaultTrainer, HookBase
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader, DatasetCatalog, MetadataCatalog
from detectron2.structures import BoxMode

Unregister the datasets if they are already registered

for d in ["pv_anomaly_train", "pv_anomaly_val", "pv_anomaly_test"]:
if d in DatasetCatalog.list():
DatasetCatalog.remove(d)
if d in MetadataCatalog.list():
MetadataCatalog.remove(d)

def load_coco_json(json_file, image_root, dataset_name):
with open(json_file) as f:
imgs_anns = json.load(f)

dataset_dicts = []
for img_ann in imgs_anns["images"]:
    record = {}
    record["file_name"] = os.path.join(image_root, img_ann["file_name"])
    record["image_id"] = img_ann["id"]
    record["height"] = img_ann["height"]
    record["width"] = img_ann["width"]

    objs = []
    for ann in imgs_anns["annotations"]:
        if ann["image_id"] != img_ann["id"]:
            continue
        obj = {
            "bbox": ann["bbox"],
            "bbox_mode": BoxMode.XYWH_ABS,
            "category_id": ann["category_id"] - 1,  # Subtract 1 to make the category_id 0-based
            "iscrowd": ann["iscrowd"]
        }
        objs.append(obj)
    record["annotations"] = objs
    dataset_dicts.append(record)
return dataset_dicts

def register_datasets():
DatasetCatalog.register(
"pv_anomaly_train",
lambda: load_coco_json(
"/kaggle/working/0PVProjects/Univpm_DataSet/labels/train_annotations.json",
"/kaggle/working/0PVProjects/Univpm_DataSet/images/train_combined_data",
"pv_anomaly_train"
)
)
MetadataCatalog.get("pv_anomaly_train").set(thing_classes=["anomaly"])

DatasetCatalog.register(
    "pv_anomaly_val",
    lambda: load_coco_json(
        "/kaggle/working/0PVProjects/Univpm_DataSet/labels/val_annotations.json",
        "/kaggle/working/0PVProjects/Univpm_DataSet/images/val",
        "pv_anomaly_val"
    )
)
MetadataCatalog.get("pv_anomaly_val").set(thing_classes=["anomaly"])

DatasetCatalog.register(
    "pv_anomaly_test",
    lambda: load_coco_json(
        "/kaggle/working/0PVProjects/Univpm_DataSet/labels/test_annotations.json",
        "/kaggle/working/0PVProjects/Univpm_DataSet/images/test",
        "pv_anomaly_test"
    )
)
MetadataCatalog.get("pv_anomaly_test").set(thing_classes=["anomaly"])

# Retrieve metadata to ensure it is set correctly
pv_anomaly_metadata = MetadataCatalog.get("pv_anomaly_train")
pv_anomaly_metadata1 = MetadataCatalog.get("pv_anomaly_val")
print(pv_anomaly_metadata)
print(pv_anomaly_metadata1)

def set_multiprocessing_start_method():
try:
mp.set_start_method('spawn', force=True)
except RuntimeError as e:
if "context has already been set" in str(e):
print("Multiprocessing context already set, continuing without changing start method.")
else:
raise

class PrintMetricsHook(HookBase):
def init(self, cfg):
self.cfg = cfg

def after_step(self):
    # Every iteration
    iter_num = self.trainer.iter
    # Print the metrics for every iteration
    metrics = self.trainer.storage.latest()

    # Format and print the metrics
    print(f"\\nIteration {iter_num} Metrics:")
    print(f"{'-'*40}")

    for key, (value, _) in metrics.items():
        print(f"{key}: {value:.4f}")

    print(f"{'-'*40}")

class MyTrainer(DefaultTrainer):
@classmethod
def build_evaluator(cls, cfg, dataset_name):
return COCOEvaluator(dataset_name, cfg, False, output_dir=cfg.OUTPUT_DIR)

def main():
register_datasets()
set_multiprocessing_start_method()

# Get default config
cfg = get_cfg()

# Load Faster R-CNN with "ResNeXt-101-32x8d model trained with Caffe2 at FB" backbone pre-trained on COCO
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml"))

# Set training and validation datasets
cfg.DATASETS.TRAIN = ("pv_anomaly_train",)  # Training dataset
cfg.DATASETS.TEST = ("pv_anomaly_val",)     # Validation dataset

# Number of data loading workers
cfg.DATALOADER.NUM_WORKERS = 2

# Set weights for pre-trained model
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml")

# Number of images per batch
cfg.SOLVER.IMS_PER_BATCH = 2  # Reduced batch size to fit into memory

# Base learning rate
cfg.SOLVER.BASE_LR = 0.00025

# Maximum number of iterations in detectron2, epoch is MAX_ITER * BATCH_SIZE / TOTAL_NUM_IMAGES
cfg.SOLVER.MAX_ITER = 3000

# ROI Heads batch size per image
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128

# Number of classes (in this case, only 1: 'anomaly')
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1


# Create output directory if it doesn't exist
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

# Set the maximum split size to avoid fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

# Create a MyTrainer instance
trainer = MyTrainer(cfg)

# Add the custom hook to print metrics after each iteration
trainer.register_hooks([PrintMetricsHook(cfg)])

# Resume training if checkpoint exists, otherwise start from scratch
trainer.resume_or_load(resume=False)

# Start training
trainer.train()

# Run evaluation
evaluator = COCOEvaluator("pv_anomaly_val", cfg, False, output_dir=cfg.OUTPUT_DIR)
val_loader = build_detection_test_loader(cfg, "pv_anomaly_val")
inference_on_dataset(trainer.model, val_loader, evaluator)

if name == "main":
main()
"""

Write the script to a file

script_path = '/kaggle/working/train_yolo.py'
with open(script_path, 'w') as f:
f.write(script_content)

Define the command to run the training script using torch.distributed.run

train_command = f"""
python -m torch.distributed.run --nproc_per_node=2 {script_path}
"""

Execute the training command

os.system(train_command)

`
best regards!

Copy link

You've chosen to report an unexpected problem or bug. Unless you already know the root cause of it, please include details about it by filling the issue template.
The following information is missing: "Instructions To Reproduce the Issue and Full Logs"; "Your Environment";

@github-actions github-actions bot added needs-more-info More info is needed to complete the issue and removed needs-more-info More info is needed to complete the issue labels Jun 21, 2024
@Programmer-RD-AI
Copy link
Contributor

Hi,
Check the Issues #2442 & #2473
Example on How to implement multi GPU training
Hope this helps, If there are further questions please feel free to comment :)
Best regards,
Ranuga

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants