You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Hi, I am trying to use multi-GPU training on Kaggle with two Tesla T4 GPUs.
My code only runs on one GPU; the other one is not utilized.
I am able to train with a custom dataset and get acceptable results, but I wish to use both GPUs for faster training.
I am using this command, but it is not working: "python -m torch.distributed.launch --nproc_per_node=2 train_yolo.py"
Full runnable code or full changes you made:
`
import os
import json
import multiprocessing as mp
from detectron2.engine import DefaultTrainer, HookBase
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader, DatasetCatalog, MetadataCatalog
from detectron2.structures import BoxMode
# Define the training script content
script_content = """
import os
import json
import multiprocessing as mp
from detectron2.engine import DefaultTrainer, HookBase
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader, DatasetCatalog, MetadataCatalog
from detectron2.structures import BoxMode
# Remove any existing catalog entries for the three dataset names
for dataset_name in ("pv_anomaly_train", "pv_anomaly_val", "pv_anomaly_test"):
    # Same order as before for each name: DatasetCatalog first, then MetadataCatalog
    for catalog in (DatasetCatalog, MetadataCatalog):
        if dataset_name in catalog.list():
            catalog.remove(dataset_name)
def load_coco_json(json_file, image_root, dataset_name):
    '''Build one dataset record per image from a COCO-style annotation file.

    Args:
        json_file: Path to a JSON file holding "images" and "annotations" lists.
        image_root: Directory that each image's "file_name" is joined onto.
        dataset_name: Not used by the body; kept so existing callers keep working.

    Returns:
        A list with one dict per entry of "images", carrying file_name,
        image_id, height, width and annotations. An image that no annotation
        refers to gets an empty annotations list.
    '''
    from collections import defaultdict

    with open(json_file, encoding="utf-8") as f:
        imgs_anns = json.load(f)

    # Group the raw annotations by image once, instead of rescanning the whole
    # annotation list for every image. File order is kept within each group.
    anns_by_image = defaultdict(list)
    for ann in imgs_anns["annotations"]:
        anns_by_image[ann["image_id"]].append(ann)

    dataset_dicts = []
    for img_ann in imgs_anns["images"]:
        # Converted per image so that annotations which match no image are
        # never touched, and every record owns its own list.
        objs = [
            {
                "bbox": ann["bbox"],
                "bbox_mode": BoxMode.XYWH_ABS,
                # Subtract 1 to make the category_id 0-based
                "category_id": ann["category_id"] - 1,
                "iscrowd": ann["iscrowd"],
            }
            for ann in anns_by_image.get(img_ann["id"], ())
        ]
        dataset_dicts.append(
            {
                "file_name": os.path.join(image_root, img_ann["file_name"]),
                "image_id": img_ann["id"],
                "height": img_ann["height"],
                "width": img_ann["width"],
                "annotations": objs,
            }
        )
    return dataset_dicts
# Register the validation split and then the test split, each with the
# single class "anomaly"
for split_name, json_path, image_dir in (
    (
        "pv_anomaly_val",
        "/kaggle/working/0PVProjects/Univpm_DataSet/labels/val_annotations.json",
        "/kaggle/working/0PVProjects/Univpm_DataSet/images/val",
    ),
    (
        "pv_anomaly_test",
        "/kaggle/working/0PVProjects/Univpm_DataSet/labels/test_annotations.json",
        "/kaggle/working/0PVProjects/Univpm_DataSet/images/test",
    ),
):
    DatasetCatalog.register(
        split_name,
        # The values are bound as defaults because the loader is called later;
        # a plain closure would only see the last iteration's values.
        lambda json_path=json_path, image_dir=image_dir, split_name=split_name: load_coco_json(
            json_path, image_dir, split_name
        ),
    )
    MetadataCatalog.get(split_name).set(thing_classes=["anomaly"])

# Retrieve metadata to ensure it is set correctly
pv_anomaly_metadata = MetadataCatalog.get("pv_anomaly_train")
pv_anomaly_metadata1 = MetadataCatalog.get("pv_anomaly_val")
print(pv_anomaly_metadata)
print(pv_anomaly_metadata1)
def set_multiprocessing_start_method():
    '''Force the multiprocessing start method to 'spawn'.

    force=True replaces a start method that was already chosen, so this call
    does not raise RuntimeError for an already-set context. The handler that
    used to catch that error could never run and has been removed.
    '''
    mp.set_start_method('spawn', force=True)
class PrintMetricsHook(HookBase):
    '''Trainer hook that prints the latest stored metrics after every step.'''

    def __init__(self, cfg):
        # Has to be the real constructor: the hook is created as
        # PrintMetricsHook(cfg), which a method named plain init would not serve.
        super().__init__()
        self.cfg = cfg

    def after_step(self):
        '''Print every entry of the trainer storage's latest metrics.'''
        iter_num = self.trainer.iter
        # Each entry is unpacked below as a pair; only its first element is printed
        metrics = self.trainer.storage.latest()
        # The doubled backslash is deliberate: this text sits inside the
        # script_content string literal, which turns it into a newline escape
        # in the script that gets written out.
        print(f"\\nIteration {iter_num} Metrics:")
        print(f"{'-'*40}")
        for key, (value, _) in metrics.items():
            print(f"{key}: {value:.4f}")
        print(f"{'-'*40}")
# Get default config
cfg = get_cfg()
# Load Faster R-CNN with "ResNeXt-101-32x8d model trained with Caffe2 at FB" backbone pre-trained on COCO
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml"))
# Set training and validation datasets
cfg.DATASETS.TRAIN = ("pv_anomaly_train",) # Training dataset
cfg.DATASETS.TEST = ("pv_anomaly_val",) # Validation dataset
# Number of data loading workers
cfg.DATALOADER.NUM_WORKERS = 2
# Start from the model-zoo checkpoint that matches the config merged above
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml")
# Number of images per batch
# NOTE(review): presumably this is the total across all processes, not per GPU - confirm
cfg.SOLVER.IMS_PER_BATCH = 2 # Reduced batch size to fit into memory
# Base learning rate
cfg.SOLVER.BASE_LR = 0.00025
# Maximum number of iterations; number of epochs = MAX_ITER * IMS_PER_BATCH / total number of training images
cfg.SOLVER.MAX_ITER = 3000
# ROI Heads batch size per image
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128
# Number of classes (in this case, only 1: 'anomaly')
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1
# Create output directory if it doesn't exist
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
# Set the maximum split size to avoid fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
# Create a MyTrainer instance
# NOTE(review): MyTrainer is not defined in the visible part of this script - confirm it exists in the full file.
# NOTE(review): nothing in the visible script initialises torch.distributed (for example through
# detectron2.engine.launch). If the full script does not either, the processes started by the
# launcher would presumably each train on their own instead of sharing the work - confirm.
trainer = MyTrainer(cfg)
# Add the custom hook to print metrics after each iteration
trainer.register_hooks([PrintMetricsHook(cfg)])
# NOTE(review): resume=False is passed, so this presumably starts from cfg.MODEL.WEIGHTS rather than
# resuming from a checkpoint in OUTPUT_DIR; the earlier comment here described the opposite - confirm intent.
trainer.resume_or_load(resume=False)
# Start training
trainer.train()
# Run evaluation on the validation set with the trained model
# NOTE(review): the third argument (False) looks like COCOEvaluator's distributed flag; with several
# processes each one would then score only its own predictions - confirm this is intended.
evaluator = COCOEvaluator("pv_anomaly_val", cfg, False, output_dir=cfg.OUTPUT_DIR)
val_loader = build_detection_test_loader(cfg, "pv_anomaly_val")
# The value returned by inference_on_dataset is not stored or printed here
inference_on_dataset(trainer.model, val_loader, evaluator)
# Run main() only when this file is executed as a script. The dunder names are
# required: a bare name is undefined and raises NameError at this line.
if __name__ == "__main__":
    main()
"""
# Write the script to a file
script_path = '/kaggle/working/train_yolo.py'
# Explicit encoding so the written file does not depend on the platform default
with open(script_path, 'w', encoding='utf-8') as f:
    f.write(script_content)
# Define the command to run the training script using torch.distributed.run
You've chosen to report an unexpected problem or bug. Unless you already know the root cause of it, please include details about it by filling the issue template.
The following information is missing: "Instructions To Reproduce the Issue and Full Logs"; "Your Environment";
Hi, I am trying to use multi-GPU training on Kaggle with two Tesla T4 GPUs.
My code only runs on one GPU; the other one is not utilized.
I am able to train with a custom dataset and get acceptable results, but I wish to use both GPUs for faster training.
I am using this command, but it is not working: "python -m torch.distributed.launch --nproc_per_node=2 train_yolo.py"
Full runnable code or full changes you made:
`
import os
import json
import multiprocessing as mp
from detectron2.engine import DefaultTrainer, HookBase
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader, DatasetCatalog, MetadataCatalog
from detectron2.structures import BoxMode
# Define the training script content
script_content = """
import os
import json
import multiprocessing as mp
from detectron2.engine import DefaultTrainer, HookBase
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader, DatasetCatalog, MetadataCatalog
from detectron2.structures import BoxMode
# Remove any existing catalog entries for the three dataset names
for dataset_name in ("pv_anomaly_train", "pv_anomaly_val", "pv_anomaly_test"):
    # Same order as before for each name: DatasetCatalog first, then MetadataCatalog
    for catalog in (DatasetCatalog, MetadataCatalog):
        if dataset_name in catalog.list():
            catalog.remove(dataset_name)
def load_coco_json(json_file, image_root, dataset_name):
    # Parses the JSON annotation file at json_file into imgs_anns.
    # NOTE(review): nothing after the json.load call is visible in this copy, so as
    # written the function returns None and never uses image_root or dataset_name -
    # the record-building part looks truncated; confirm against the full script.
    with open(json_file) as f:
        imgs_anns = json.load(f)
def register_datasets():
    '''Register the training split and tag it with the single class "anomaly".'''
    train_name = "pv_anomaly_train"
    annotation_file = "/kaggle/working/0PVProjects/Univpm_DataSet/labels/train_annotations.json"
    image_dir = "/kaggle/working/0PVProjects/Univpm_DataSet/images/train_combined_data"

    def _load_train_split():
        # Called by the catalog when the dataset is requested, not at registration time
        return load_coco_json(annotation_file, image_dir, train_name)

    DatasetCatalog.register(train_name, _load_train_split)
    MetadataCatalog.get(train_name).set(thing_classes=["anomaly"])
def set_multiprocessing_start_method():
    '''Force the multiprocessing start method to 'spawn'.

    force=True replaces a start method that was already chosen, so this call
    does not raise RuntimeError for an already-set context. The handler that
    used to catch that error could never run and has been removed.
    '''
    mp.set_start_method('spawn', force=True)
class PrintMetricsHook(HookBase):
    '''Hook that keeps a reference to the config it was created with.'''

    def __init__(self, cfg):
        # Has to be the real constructor so that it runs on PrintMetricsHook(cfg);
        # a method named plain init is never called at construction.
        super().__init__()
        self.cfg = cfg
class MyTrainer(DefaultTrainer):
    '''DefaultTrainer whose evaluator is a COCOEvaluator.'''

    @classmethod
    def build_evaluator(cls, cfg, dataset_name):
        '''Return a COCOEvaluator for dataset_name that writes to cfg.OUTPUT_DIR.'''
        # Third argument is the evaluator's distributed flag. True makes it
        # collect predictions from every process before scoring; with False a
        # multi-process run would score only each process's own share of the
        # data. A single-process run behaves the same either way.
        return COCOEvaluator(dataset_name, cfg, True, output_dir=cfg.OUTPUT_DIR)
def main():
    # Entry point: register the training dataset, then set the multiprocessing start method.
    # NOTE(review): no config, trainer construction or training call is visible in this
    # copy of main() - it looks truncated; confirm against the full script.
    register_datasets()
    set_multiprocessing_start_method()
# Run main() only when this file is executed as a script. The dunder names are
# required: a bare name is undefined and raises NameError at this line.
if __name__ == "__main__":
    main()
"""
# Write the script to a file
script_path = '/kaggle/working/train_yolo.py'
# Explicit encoding so the written file does not depend on the platform default
with open(script_path, 'w', encoding='utf-8') as f:
    f.write(script_content)
# Define the command to run the training script using torch.distributed.run
train_command = f"""
python -m torch.distributed.run --nproc_per_node=2 {script_path}
"""
# Execute the training command. os.system reports failure only through its
# return value, so check it instead of discarding it.
exit_status = os.system(train_command)
if exit_status != 0:
    raise RuntimeError(f"Training command failed: os.system returned {exit_status}")
`
best regards!
The text was updated successfully, but these errors were encountered: