[Cherry-pick] to Release/2.3, Improve MSRAInitializer #43721

Merged · 2 commits · Jun 22, 2022
77 changes: 45 additions & 32 deletions python/paddle/fluid/initializer.py
@@ -679,20 +679,23 @@ class MSRAInitializer(Initializer):

.. math::

x = \sqrt{\\frac{6.0}{fan\_in}}
x = gain \times \sqrt{\frac{3}{fan\_in}}

In case of Normal distribution, the mean is 0 and the standard deviation
is

.. math::

\sqrt{\\frac{2.0}{fan\_in}}
\frac{gain}{\sqrt{{fan\_in}}}

Args:
uniform (bool): whether to use uniform or normal distribution
fan_in (float32|None): fan_in for MSRAInitializer. If None, it is\
inferred from the variable. default is None.
fan_in (float32|None): fan_in (in_features) of the trainable Tensor,\
if None, it will be inferred from the Tensor automatically. If you do not want to use the Tensor's in_features,\
you can set 'fan_in' to a value of your own choice. default is None.
seed (int32): random seed
negative_slope (float, optional): negative_slope of the rectifier (only used when nonlinearity is 'leaky_relu'). default is 0.0.
nonlinearity (str, optional): name of the non-linear function used to compute the gain. default is 'relu'.

Note:
It is recommended to set fan_in to None for most cases.
@@ -709,7 +712,12 @@ class MSRAInitializer(Initializer):

"""

def __init__(self, uniform=True, fan_in=None, seed=0):
def __init__(self,
uniform=True,
fan_in=None,
seed=0,
negative_slope=0,
nonlinearity='relu'):
"""Constructor for MSRAInitializer
"""
assert uniform is not None
@@ -718,6 +726,8 @@ def __init__(self, uniform=True, fan_in=None, seed=0):
self._uniform = uniform
self._fan_in = fan_in
self._seed = seed
self._negative_slope = negative_slope
self._nonlinearity = nonlinearity

def __call__(self, var, block=None):
"""Initialize the input tensor with MSRA initialization.
@@ -759,13 +769,16 @@ def __call__(self, var, block=None):

if framework._non_static_mode():
if self._uniform:
limit = np.sqrt(6.0 / float(fan_in))
gain = calculate_gain(self._nonlinearity, self._negative_slope)
limit = gain * math.sqrt(3.0 / float(fan_in))

out_var = _C_ops.uniform_random('shape', out_var.shape, 'min',
-limit, 'max', limit, 'seed',
self._seed, 'dtype',
int(out_dtype))
else:
std = math.sqrt(2.0 / float(fan_in))
gain = calculate_gain(self._nonlinearity, self._negative_slope)
std = gain / math.sqrt(float(fan_in))
if in_dygraph_mode():
place = _current_expected_place()
out_var = _C_ops.final_state_gaussian_random(
@@ -786,33 +799,33 @@
return None
else:
if self._uniform:
limit = np.sqrt(6.0 / float(fan_in))
op = block.append_op(
type="uniform_random",
inputs={},
outputs={"Out": out_var},
attrs={
"shape": out_var.shape,
"dtype": int(out_dtype),
"min": -limit,
"max": limit,
"seed": self._seed
},
stop_gradient=True)
gain = calculate_gain(self._nonlinearity, self._negative_slope)
limit = gain * math.sqrt(3.0 / float(fan_in))
op = block.append_op(type="uniform_random",
inputs={},
outputs={"Out": out_var},
attrs={
"shape": out_var.shape,
"dtype": int(out_dtype),
"min": -limit,
"max": limit,
"seed": self._seed
},
stop_gradient=True)

else:
std = np.sqrt(2.0 / float(fan_in))
op = block.append_op(
type="gaussian_random",
outputs={"Out": out_var},
attrs={
"shape": out_var.shape,
"dtype": int(out_dtype),
"mean": 0.0,
"std": std,
"seed": self._seed
},
stop_gradient=True)
gain = calculate_gain(self._nonlinearity, self._negative_slope)
std = gain / math.sqrt(float(fan_in))
op = block.append_op(type="gaussian_random",
outputs={"Out": out_var},
attrs={
"shape": out_var.shape,
"dtype": int(out_dtype),
"mean": 0.0,
"std": std,
"seed": self._seed
},
stop_gradient=True)

if var.dtype == VarDesc.VarType.FP16 or (
var.dtype == VarDesc.VarType.BF16 and not self._uniform):
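
Side note (not part of the diff): with the default nonlinearity='relu' and negative_slope=0, the new gain-based formulas reduce to the hard-coded ones they replace. A minimal standalone sketch of the bound computation, assuming the usual Kaiming gain values (sqrt(2) for 'relu', sqrt(2 / (1 + negative_slope^2)) for 'leaky_relu'); the PR itself obtains the gain from calculate_gain() instead:

    import math

    def kaiming_bounds(fan_in, nonlinearity='relu', negative_slope=0.0):
        # Gain values assumed here for illustration only.
        if nonlinearity == 'leaky_relu':
            gain = math.sqrt(2.0 / (1.0 + negative_slope ** 2))
        else:  # treat anything else as 'relu' in this sketch
            gain = math.sqrt(2.0)
        limit = gain * math.sqrt(3.0 / float(fan_in))  # uniform: U(-limit, limit)
        std = gain / math.sqrt(float(fan_in))          # normal: N(0, std ** 2)
        return limit, std

    # For 'relu' this reproduces the replaced formulas:
    # limit == sqrt(6.0 / fan_in), std == sqrt(2.0 / fan_in).
    print(kaiming_bounds(fan_in=128))
    print(kaiming_bounds(fan_in=128, nonlinearity='leaky_relu', negative_slope=0.1))
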
36 changes: 24 additions & 12 deletions python/paddle/nn/initializer/kaiming.py
@@ -33,11 +33,14 @@ class KaimingNormal(MSRAInitializer):

.. math::

\sqrt{\frac{2.0}{fan\_in}}
\frac{gain}{\sqrt{{fan\_in}}}

Args:
fan_in (float32|None): fan_in for Kaiming normal Initializer. If None, it is\
inferred from the variable. default is None.
fan_in (float32|None): fan_in (in_features) of the trainable Tensor,\
if None, it will be inferred from the Tensor automatically. If you do not want to use the Tensor's in_features,\
you can set 'fan_in' to a value of your own choice. default is None.
negative_slope (float, optional): negative_slope of the rectifier (only used when nonlinearity is 'leaky_relu'). default is 0.0.
nonlinearity (str, optional): name of the non-linear function used to compute the gain. default is 'relu'.

Note:
It is recommended to set fan_in to None for most cases.
@@ -56,9 +59,12 @@ class KaimingNormal(MSRAInitializer):

"""

def __init__(self, fan_in=None):
super(KaimingNormal, self).__init__(
uniform=False, fan_in=fan_in, seed=0)
def __init__(self, fan_in=None, negative_slope=0.0, nonlinearity='relu'):
super(KaimingNormal, self).__init__(uniform=False,
fan_in=fan_in,
seed=0,
negative_slope=negative_slope,
nonlinearity=nonlinearity)


class KaimingUniform(MSRAInitializer):
@@ -75,11 +81,14 @@ class KaimingUniform(MSRAInitializer):

.. math::

x = \sqrt{\frac{6.0}{fan\_in}}
x = gain \times \sqrt{\frac{3}{fan\_in}}

Args:
fan_in (float32|None): fan_in for Kaiming uniform Initializer. If None, it is\
inferred from the variable. default is None.
fan_in (float32|None): fan_in (in_features) of the trainable Tensor,\
if None, it will be inferred from the Tensor automatically. If you do not want to use the Tensor's in_features,\
you can set 'fan_in' to a value of your own choice. default is None.
negative_slope (float, optional): negative_slope of the rectifier (only used when nonlinearity is 'leaky_relu'). default is 0.0.
nonlinearity (str, optional): name of the non-linear function used to compute the gain. default is 'relu'.

Note:
It is recommended to set fan_in to None for most cases.
@@ -98,6 +107,9 @@ class KaimingUniform(MSRAInitializer):

"""

def __init__(self, fan_in=None):
super(KaimingUniform, self).__init__(
uniform=True, fan_in=fan_in, seed=0)
def __init__(self, fan_in=None, negative_slope=0.0, nonlinearity='relu'):
super(KaimingUniform, self).__init__(uniform=True,
fan_in=fan_in,
seed=0,
negative_slope=negative_slope,
nonlinearity=nonlinearity)
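
For reference, a hedged usage sketch of the extended initializers: the negative_slope and nonlinearity arguments follow the docstrings above, while the layer shapes and slope value are illustrative only.

    import paddle

    # KaimingNormal with a leaky_relu gain, applied to a Linear weight.
    weight_attr = paddle.ParamAttr(
        initializer=paddle.nn.initializer.KaimingNormal(
            negative_slope=0.1, nonlinearity='leaky_relu'))
    linear = paddle.nn.Linear(64, 32, weight_attr=weight_attr)

    # KaimingUniform with the defaults keeps the previous relu-based behaviour.
    conv_attr = paddle.ParamAttr(
        initializer=paddle.nn.initializer.KaimingUniform())
    conv = paddle.nn.Conv2D(3, 16, kernel_size=3, weight_attr=conv_attr)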