diff --git a/test/test_models.py b/test/test_models.py
index 3ec3ea9c..16379af3 100644
--- a/test/test_models.py
+++ b/test/test_models.py
@@ -108,7 +108,8 @@ def _init_test_yolo_head(self):
         in_channels = self._get_in_channels()
         num_anchors = self._get_num_anchors()
         num_classes = self._get_num_classes()
-        box_head = YoloHead(in_channels, num_anchors, num_classes)
+        strides = self._get_strides()
+        box_head = YoloHead(in_channels, num_anchors, strides, num_classes)
         return box_head
 
     def test_yolo_head(self):
diff --git a/yolort/models/box_head.py b/yolort/models/box_head.py
index 5742a991..f4d64886 100644
--- a/yolort/models/box_head.py
+++ b/yolort/models/box_head.py
@@ -1,4 +1,5 @@
 # Modified from ultralytics/yolov5 by Zhiqiang Wang
+import math
 import torch
 from torch import nn, Tensor
 
@@ -10,14 +11,31 @@
 
 
 class YoloHead(nn.Module):
-    def __init__(self, in_channels: List[int], num_anchors: int, num_classes: int):
+    def __init__(self, in_channels: List[int], num_anchors: int, strides: List[int], num_classes: int):
         super().__init__()
         self.num_anchors = num_anchors  # anchors
+        self.num_classes = num_classes  # read by _initialize_biases for the default class-bias prior
         self.num_outputs = num_classes + 5  # number of outputs per anchor
+        self.strides = strides  # paired element-wise with self.head in _initialize_biases
 
         self.head = nn.ModuleList(
             nn.Conv2d(ch, self.num_outputs * self.num_anchors, 1) for ch in in_channels)  # output conv
 
+        self._initialize_biases()  # Init the objectness and class biases of every output conv
+
+    def _initialize_biases(self, cf=None):
+        """
+        Initialize the objectness and class biases of every head conv, see section 3.3 in
+        <https://arxiv.org/abs/1708.02002>. `cf` is an optional tensor of class frequencies.
+        """
+        for mi, s in zip(self.head, self.strides):
+            b = mi.bias.view(self.num_anchors, -1)  # conv.bias(255) to (3,85)
+            # obj (8 objects per 640 image)
+            b.data[:, 4] += math.log(8 / (640 / s) ** 2)
+            # classes; test `cf is None` explicitly, bool() of a multi-element tensor raises
+            b.data[:, 5:] += math.log(0.6 / (self.num_classes - 0.99)) if cf is None else torch.log(cf / cf.sum())
+            mi.bias = nn.Parameter(b.view(-1), requires_grad=True)
+
     def get_result_from_head(self, features: Tensor, idx: int) -> Tensor:
         """
         This is equivalent to self.head[idx](features),
@@ -199,7 +217,8 @@ def assign_targets_to_anchors(
             # Append
             a = targets_with_gain[:, 6].long()  # anchor indices
             # image, anchor, grid indices
-            indices.append((bc[0], a, grid_ij[:, 1].clamp_(0, gain[3] - 1), grid_ij[:, 0].clamp_(0, gain[2] - 1)))
+            indices.append((bc[0], a, grid_ij[:, 1].clamp_(0, gain[3] - 1),
+                            grid_ij[:, 0].clamp_(0, gain[2] - 1)))
             targets_box.append(torch.cat((grid_xy - grid_ij, grid_wh), 1))  # box
             anchors_encode.append(anchors_per_layer[a])  # anchors
             targets_cls.append(bc[1])  # class
diff --git a/yolort/models/yolo.py b/yolort/models/yolo.py
index 166b86dc..0f00d5cd 100644
--- a/yolort/models/yolo.py
+++ b/yolort/models/yolo.py
@@ -56,6 +56,7 @@ def __init__(
             head = YoloHead(
                 backbone.out_channels,
                 anchor_generator.num_anchors,
+                anchor_generator.strides,
                 num_classes,
             )
         self.head = head