Add Layer Normalization #2213

Merged (13 commits, Oct 20, 2020)
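
The hunks shown here add layer normalization to dlib's DNN toolkit: CPU and CUDA implementations of layer_normalize and layer_normalize_gradient. Each input sample is normalized to zero mean and unit variance over its k * nr * nc values, then scaled and shifted by gamma and beta tensors that hold one value per sample.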
171 changes: 171 additions & 0 deletions dlib/cuda/cpu_dlib.cpp
@@ -1258,6 +1258,177 @@ namespace dlib
}
}

// -----------------------------------------------------------------------------------

void layer_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
const tensor& src,
const tensor& gamma,
const tensor& beta
)
{
const long num = src.k() * src.nr() * src.nc();
DLIB_CASSERT(
have_same_dimensions(gamma, beta) &&
src.num_samples() == gamma.size() &&
src.num_samples() == beta.size() &&
eps > 0,
"\nsrc.num_samples(): " << src.num_samples() <<
"\ngamma.size(): " << gamma.size() <<
"\nbeta.size(): " << beta.size() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
"\ngamma.nc(): " << gamma.nc() <<
"\nbeta.k(): " << beta.k() <<
"\nbeta.nr(): " << beta.nr() <<
"\nbeta.nc(): " << beta.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
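
// A sketch of the math implemented below: each sample n is normalized over
// its D = k*nr*nc values,
//   mean_n = 1/D * sum_i x[n,i]
//   var_n  = 1/D * sum_i x[n,i]^2 - mean_n^2
//   y[n,i] = gamma[n] * (x[n,i] - mean_n) / sqrt(var_n + eps) + beta[n]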

dest.copy_size(src);
means.set_size(src.num_samples());
invstds.set_size(src.num_samples());

// first compute means and invstds
means = 0;
invstds = 0;
const auto p_invstds = invstds.host();
const auto p_means = means.host();
auto p_src = src.host();
// accumulate the sum and the sum of squares for each sample
for (long n = 0; n < src.num_samples(); ++n)
{
for (long i = 0; i < num; ++i)
{
float val = p_src[n*num+i];
p_means[n] += val;
p_invstds[n] += val*val;
}
}
means /= num;
invstds /= num;
// the divisions above may have run on the device (they dispatch through
// dlib's tensor ops), so touch host() to sync the results back before
// reading them below
invstds.host(); means.host();

// convert E[x^2] into the inverse standard deviation: 1/sqrt(var + eps)
for (long n = 0; n < src.num_samples(); ++n)
{
auto var = p_invstds[n] - p_means[n] * p_means[n];
p_invstds[n] = 1.0f / std::sqrt(var + eps);
}

// normalize each sample, then apply its per-sample scale and shift
p_src = src.host();
auto p_dest = dest.host();
auto p_gamma = gamma.host();
auto p_beta = beta.host();
for (long n = 0; n < src.num_samples(); ++n)
{
for (long i = 0; i < num; ++i)
{
*p_dest = (*p_src - p_means[n])*p_invstds[n];
*p_dest = (*p_dest)*p_gamma[n] + p_beta[n];
++p_src;
++p_dest;
}
}
}

void layer_normalize_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
const tensor& src,
const tensor& gamma,
tensor& src_grad,
tensor& gamma_grad,
tensor& beta_grad
)
{
const long num = src.k() * src.nr() * src.nc();
DLIB_CASSERT(src.num_samples() == means.size());
DLIB_CASSERT(src.num_samples() == invstds.size());
DLIB_CASSERT(src.num_samples() == gamma.size());
DLIB_CASSERT(src.num_samples() == gamma_grad.size());
DLIB_CASSERT(src.num_samples() == beta_grad.size());
DLIB_CASSERT(have_same_dimensions(gradient_input, src));
DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad));
DLIB_CASSERT(eps > 0);
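
// A sketch of the gradients computed below, with D = num and
// dx[n,i] = gradient_input[n,i] * gamma[n]:
//   beta_grad[n]  = sum_i gradient_input[n,i]
//   gamma_grad[n] = sum_i gradient_input[n,i] * x_hat[n,i]
//   dvar[n]       = sum_i dx[n,i] * (x[n,i] - mean_n) * -0.5 * invstd_n^3
//   dmean[n]      = sum_i (-dx[n,i] * invstd_n - dvar[n] * 2 * (x[n,i] - mean_n) / D)
//   src_grad[n,i] += dx[n,i] * invstd_n + dvar[n] * 2 * (x[n,i] - mean_n) / D + dmean[n] / D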

beta_grad = 0;
gamma_grad = 0;
auto p_grad = gradient_input.host();
auto p_src = src.host();
const auto p_gamma = gamma.host();
const auto p_gamma_grad = gamma_grad.host();
const auto p_beta_grad = beta_grad.host();
const auto p_invstds = invstds.host();
const auto p_means = means.host();

resizable_tensor dvars, dmeans;
dvars.copy_size(invstds);
dmeans.copy_size(means);
dvars = 0;
dmeans = 0;
const auto p_dvars = dvars.host();
const auto p_dmeans = dmeans.host();

for (long n = 0; n < src.num_samples(); ++n)
{
for (long i = 0; i < num; ++i)
{
const float x_hat = (*p_src - p_means[n])*p_invstds[n];
p_beta_grad[n] += *p_grad;
p_gamma_grad[n] += (*p_grad)*x_hat;

const float dx = *p_grad * p_gamma[n];

p_dvars[n] += dx*(*p_src - p_means[n])*-0.5*std::pow(p_invstds[n], 3.0f);

++p_grad;
++p_src;
}
}

const float invnum = 1.0f/num;
p_grad = gradient_input.host();
p_src = src.host();
for (long n = 0; n < src.num_samples(); ++n)
{
for (long i = 0; i < num; ++i)
{
const float dx = *p_grad * p_gamma[n];

p_dmeans[n] += dx*-p_invstds[n] + p_dvars[n] * -2*(*p_src - p_means[n])*invnum;

++p_grad;
++p_src;
}
}
p_grad = gradient_input.host();
p_src = src.host();
auto p_src_grad = src_grad.host();
for (long n = 0; n < src.num_samples(); ++n)
{
for (long i = 0; i < num; ++i)
{
const float dx = *p_grad * p_gamma[n];

*p_src_grad += dx*p_invstds[n] +
p_dvars[n] *2*(*p_src - p_means[n])*invnum +
p_dmeans[n]*invnum;

++p_grad;
++p_src;
++p_src_grad;
}
}
}

// -----------------------------------------------------------------------------------

void threshold (
24 changes: 24 additions & 0 deletions dlib/cuda/cpu_dlib.h
@@ -229,6 +229,30 @@ namespace dlib
tensor& beta_grad
);

// -----------------------------------------------------------------------------------

void layer_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
const tensor& src,
const tensor& gamma,
const tensor& beta
);

void layer_normalize_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
const tensor& src,
const tensor& gamma,
tensor& src_grad,
tensor& gamma_grad,
tensor& beta_grad
);
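
// Example (an illustrative sketch, not part of this diff). gamma and beta
// hold one value per input sample, matching the checks in cpu_dlib.cpp, and
// the usual dlib tensor utilities are assumed:
//
//   resizable_tensor x(2, 3, 4, 5);  // 2 samples, D = 3*4*5 values each
//   tt::tensor_rand rnd;
//   rnd.fill_gaussian(x);
//   resizable_tensor out, mu, inv_sd;
//   resizable_tensor g(2), b(2);     // per-sample scale and shift
//   g = 1;
//   b = 0;
//   cpu::layer_normalize(1e-5, out, mu, inv_sd, x, g, b);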

// -----------------------------------------------------------------------------------

void threshold (
163 changes: 163 additions & 0 deletions dlib/cuda/cuda_dlib.cu
@@ -1749,6 +1749,169 @@ namespace dlib
}
}

// ----------------------------------------------------------------------------------------

__global__ void _cuda_layer_normalize(float* out, const float* s, float* m, float* v, const float* g, const float* b, float eps, size_t ns, size_t num)
{
// accumulate each sample's mean and mean of squares (E[x] and E[x^2])
for (auto n : grid_stride_range_y(0, ns))
{
auto p = s + n * num;
float means = 0;
float invstds = 0;
for (auto i : grid_stride_range(0, num))
{
means += p[i];
invstds += p[i] * p[i];
}
warp_reduce_atomic_add(m[n], means/num);
warp_reduce_atomic_add(v[n], invstds/num);
}
__syncthreads();

// convert E[x^2] into the inverse standard deviation, one thread per sample
for (auto n : grid_stride_range_y(0, ns))
{
for (auto i : grid_stride_range(0, 1))
{
auto var = v[n] - m[n] * m[n];
v[n] = 1.0f / std::sqrt(var + eps);
}
}
__syncthreads();

// normalize each value, then scale and shift with the per-sample gamma and beta
for (auto n : grid_stride_range_y(0, ns))
{
for (auto i : grid_stride_range(0, num))
{
const float val = (s[n*num+i]-m[n])*v[n];
out[n*num+i] = val*g[n]+b[n];
}
}
}
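
// Reduction pattern used above: the launch below uses max_jobs(num, ns), so
// the x grid dimension strides across a sample's num values while the y
// dimension strides across samples. Each thread accumulates a private
// partial sum, and warp_reduce_atomic_add folds the partials into the
// per-sample slot in global memory.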

__global__ void _cuda_layer_normalize_gradient(float* out, float* gg, float* bg, const float* s, const float* gi, const float* m, const float* v, const float* g, float* dm, float* dv, float eps, size_t ns, size_t num)
{
for (auto n : grid_stride_range_y(0, ns))
{
float temp_bg = 0;
float temp_gg = 0;
float temp_dv = 0;
for (auto i : grid_stride_range(0, num))
{
auto idx = n*num+i;
const float x_hat = (s[idx] - m[n])*v[n];
temp_bg += gi[idx];
temp_gg += gi[idx]*x_hat;

const float dx = gi[idx] * g[n];
temp_dv += dx*(s[idx] - m[n])*-0.5*v[n]*v[n]*v[n];
}
warp_reduce_atomic_add(bg[n], temp_bg);
warp_reduce_atomic_add(gg[n], temp_gg);
warp_reduce_atomic_add(dv[n], temp_dv);
}
__syncthreads();

for (auto n : grid_stride_range_y(0, ns))
{
float temp_dm = 0;
for (auto i : grid_stride_range(0, num))
{
auto idx = n*num+i;
const float dx = gi[idx]*g[n];
temp_dm += dx*-v[n] + dv[n] * -2*(s[idx] - m[n])/num;
}
warp_reduce_atomic_add(dm[n], temp_dm);
}
__syncthreads();

for (auto n : grid_stride_range_y(0, ns))
{
for (auto i : grid_stride_range(0, num))
{
auto idx = n*num+i;
const float dx = gi[idx]*g[n];
out[idx] += dx*v[n] + dv[n] * 2*(s[idx] - m[n])/num + dm[n]/num;
}
}
}
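
// The three passes above mirror the three loops in the CPU implementation:
// beta/gamma gradients and dvar first, then dmean, then the input gradient,
// which is accumulated into src_grad (out) with +=.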

void layer_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
const tensor& src,
const tensor& gamma,
const tensor& beta
)
{
const long num = src.k() * src.nr() * src.nc();
DLIB_CASSERT(
have_same_dimensions(gamma, beta) &&
src.num_samples() == gamma.size() &&
src.num_samples() == beta.size() &&
eps > 0,
"\nsrc.num_samples(): " << src.num_samples() <<
"\ngamma.size(): " << gamma.size() <<
"\nbeta.size(): " << beta.size() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
"\ngamma.nc(): " << gamma.nc() <<
"\nbeta.k(): " << beta.k() <<
"\nbeta.nr(): " << beta.nr() <<
"\nbeta.nc(): " << beta.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);

dest.copy_size(src);
means.set_size(src.num_samples());
invstds.set_size(src.num_samples());
means = 0;
invstds = 0;
launch_kernel(_cuda_layer_normalize, max_jobs(num, src.num_samples()), dest.device(), src.device(),
means.device(), invstds.device(), gamma.device(), beta.device(), eps, src.num_samples(), num);
}

void layer_normalize_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
const tensor& src,
const tensor& gamma,
tensor& src_grad,
tensor& gamma_grad,
tensor& beta_grad
)
{
const long num = src.k() * src.nr() * src.nc();
DLIB_CASSERT(src.num_samples() == means.size());
DLIB_CASSERT(src.num_samples() == invstds.size());
DLIB_CASSERT(src.num_samples() == gamma.size());
DLIB_CASSERT(src.num_samples() == gamma_grad.size());
DLIB_CASSERT(src.num_samples() == beta_grad.size());
DLIB_CASSERT(have_same_dimensions(gradient_input, src));
DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad));
DLIB_CASSERT(eps > 0);

beta_grad = 0;
gamma_grad = 0;
resizable_tensor dvars, dmeans;
dvars.copy_size(invstds);
dmeans.copy_size(means);
dvars = 0;
dmeans = 0;
launch_kernel(_cuda_layer_normalize_gradient, max_jobs(num, src.num_samples()),
src_grad.device(), gamma_grad.device(), beta_grad.device(), src.device(),
gradient_input.device(), means.device(), invstds.device(), gamma.device(),
dmeans.device(), dvars.device(), eps, src.num_samples(), num);
}

// ----------------------------------------------------------------------------------------

__global__ void _cuda_copy_tensor_add_to (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size)