Metrics #1169

Closed
wants to merge 2 commits
optimum/onnxruntime/trainer.py (33 changes: 32 additions & 1 deletion)
@@ -714,7 +714,12 @@ def _inner_training_loop(
rng_to_sync = True

step = -1
avg_fwbw = 0.0
avg_optm = 0.0
avg_total = 0.0
avg = 0.0
for step, inputs in enumerate(train_dataloader):
iteration_start = time.time()
total_batched_samples += 1
if rng_to_sync:
self._load_rng_state(resume_from_checkpoint)
@@ -735,6 +740,7 @@
if step % args.gradient_accumulation_steps == 0:
self.control = self.callback_handler.on_step_begin(args, self.state, self.control)

start = time.time()
if (
(total_batched_samples % args.gradient_accumulation_steps != 0)
and args.local_rank != -1
@@ -757,8 +763,16 @@
tr_loss += tr_loss_step

self.current_flos += float(self.floating_point_ops(inputs))

step_time = (time.time() - start) * 1000
#if args.local_rank == 0:
# print(f"FWBW {step}: {step_time:.5f} ms")
avg_fwbw += step_time
if step >= steps_in_epoch // 2:
avg += step_time

# Optimizer step for deepspeed must be called on every step regardless of the value of gradient_accumulation_steps
start = time.time()
if self.deepspeed:
self.deepspeed.step()

@@ -817,8 +831,25 @@
else:
self.control = self.callback_handler.on_substep_end(args, self.state, self.control)

step_time = (time.time() - start) * 1000
#if args.local_rank == 0:
# print(f"optimizer {step}: {step_time:.5f} ms")
avg_optm += step_time

step_time = (time.time() - iteration_start) * 1000
#if args.local_rank == 0:
# print(f"iteration {step}: {step_time:.5f} ms")
avg_total += step_time

if self.control.should_epoch_stop or self.control.should_training_stop:
break

if args.local_rank == 0:
print(f"Avg of 2nd half: {(avg / (steps_in_epoch - steps_in_epoch // 2)):.5f} ms")
print(f"Avg of FW+BW: {(avg_fwbw / steps_in_epoch):.5f} ms")
print(f"Avg of optimizer: {(avg_optm / steps_in_epoch):.5f} ms")
print(f"Avg of iteration: {(avg_total / steps_in_epoch):.5f} ms")

if step < 0:
logger.warning(
f"There seems to be not a single sample in your train dataloader, stopping training at step"
@@ -1843,4 +1874,4 @@ def get_ort_optimizer_cls_and_kwargs(args: ORTTrainingArguments) -> Tuple[Any, Any]:
)
else:
raise ValueError(f"ORTTrainer cannot instantiate unsupported optimizer: {args.optim}")
return optimizer_cls, optimizer_kwargs
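
For reference, the sketch below reproduces the measurement pattern this diff adds, outside of the trainer: time.time() around the forward/backward phase and the optimizer phase, per-phase running totals, and a separate average over the second half of the epoch so warm-up steps do not skew the reported number. It is a minimal illustration, not optimum code; the names run_dummy_epoch and fake_work are made up for the example, and the real patch additionally gates printing on args.local_rank == 0 so only one process reports.

import time


def fake_work(seconds: float) -> None:
    # Stand-in for real forward/backward or optimizer work; sleeping keeps
    # the example self-contained and runnable on any machine.
    time.sleep(seconds)


def run_dummy_epoch(steps_in_epoch: int) -> None:
    avg_fwbw = 0.0   # accumulated forward + backward time, ms
    avg_optm = 0.0   # accumulated optimizer-step time, ms
    avg_total = 0.0  # accumulated whole-iteration time, ms
    avg = 0.0        # forward + backward time over the 2nd half of the epoch

    for step in range(steps_in_epoch):
        iteration_start = time.time()

        # Forward + backward phase (training_step in the real loop).
        start = time.time()
        fake_work(0.002)
        fwbw_ms = (time.time() - start) * 1000
        avg_fwbw += fwbw_ms
        # Only the second half of the epoch feeds `avg`, so one-off warm-up
        # costs in the first steps are excluded from that average.
        if step >= steps_in_epoch // 2:
            avg += fwbw_ms

        # Optimizer / scheduler phase.
        start = time.time()
        fake_work(0.001)
        avg_optm += (time.time() - start) * 1000

        avg_total += (time.time() - iteration_start) * 1000

    second_half_steps = steps_in_epoch - steps_in_epoch // 2
    print(f"Avg of 2nd half: {avg / second_half_steps:.5f} ms")
    print(f"Avg of FW+BW: {avg_fwbw / steps_in_epoch:.5f} ms")
    print(f"Avg of optimizer: {avg_optm / steps_in_epoch:.5f} ms")
    print(f"Avg of iteration: {avg_total / steps_in_epoch:.5f} ms")


if __name__ == "__main__":
    run_dummy_epoch(steps_in_epoch=20)

One caveat: time.time() measures host wall-clock time, so with asynchronous GPU execution the forward/backward versus optimizer split is only as precise as the synchronization points that occur inside each phase.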