LSTM node #729

Status: Merged (45 commits, Jul 24, 2017)

Commits
4e8ceb2
first stab at lstm node, forward pass
msperber Jul 10, 2017
222b24e
added comment
msperber Jul 10, 2017
c4c654e
LSTM forward: memory layout of h/c vector + restructuring/fixes
msperber Jul 11, 2017
7b6d413
bugfix + prepared unit test
msperber Jul 12, 2017
e485905
summarized 4 matrix multiplication into one operation; forward comput…
msperber Jul 12, 2017
80e2d8e
forward tests passing
msperber Jul 12, 2017
fc55106
lstm node forward pass working with minibatches
msperber Jul 12, 2017
48732f3
lstm 3-part node: forward working & tested
msperber Jul 13, 2017
b368062
backward pass for vanilla_lstm_h and vanilla_lstm_c
msperber Jul 13, 2017
10c6a46
lstm-gates: parts of backward pass
msperber Jul 14, 2017
9efa45d
some more missing parts
msperber Jul 14, 2017
0cc6379
removed old code, cleaned up tests
msperber Jul 14, 2017
115211a
vanilla_lstm_c: backward passing test
msperber Jul 17, 2017
f03db7f
some testing / fixing
msperber Jul 18, 2017
9b1f2b4
some comments
msperber Jul 18, 2017
ab1ea1a
Merge branch 'master' into lstm-node
msperber Jul 18, 2017
fed3f30
CPU matrix multiply
msperber Jul 18, 2017
9688329
re-added LSTM node
msperber Jul 18, 2017
b217ce0
lstm gates fwd passing test
msperber Jul 18, 2017
a09b713
tests
msperber Jul 18, 2017
6ccefe4
lstm_h backward passing gradient checks
msperber Jul 19, 2017
59235b2
lstm gates backward: replaced contraction by matrix multiply
msperber Jul 19, 2017
f5ab4bf
lstm gates bwd implemented but not yet passing tests
msperber Jul 19, 2017
49fabe6
fixed math in lstm_h backward
msperber Jul 19, 2017
eefbe47
fixed the same math error in lstm_gates bwd
msperber Jul 19, 2017
f050747
improved tests; all tests passing
msperber Jul 19, 2017
2d8c184
unified CPUMatrixMultiply / CUDAMatrixMultiply -> MatrixMultiply
msperber Jul 19, 2017
5582778
python interface for LSTM node
msperber Jul 20, 2017
7a915a8
changed use of slice for potential speed improvement
msperber Jul 20, 2017
c4f2ac3
speed-up: replace .sum() by manual summation over batches
msperber Jul 20, 2017
6b37e7b
speed-up by replacing shuffle with reshape
msperber Jul 20, 2017
2093f77
speed by replacing shuffle with transpose
msperber Jul 20, 2017
e93b6d4
marked places that need speed-up
msperber Jul 20, 2017
6e516cc
speed up for outer product
msperber Jul 20, 2017
e6b90c5
Some fixes for GPU
neubig Jul 20, 2017
3e54f0b
Comment out extra scratch memory
neubig Jul 21, 2017
3d1bfe3
Merge pull request #1 from neubig/lstm-node
msperber Jul 21, 2017
182fb08
add CompactVanillaLSTMBuilder
msperber Jul 21, 2017
cc3739a
added weight norm to vanilla_lstm_gates
msperber Jul 21, 2017
fbd632a
updated doc & python interface
msperber Jul 21, 2017
1132bac
add weight noise to CompactVanillaLSTMBuilder
msperber Jul 21, 2017
6cd6ccc
added missing free() of scratch allocator
msperber Jul 21, 2017
e485713
initial code to integrate dropout into lstm nodes
msperber Jul 21, 2017
14d0af5
integrated dropout into LSTM node
msperber Jul 21, 2017
8648640
removed unused variables
msperber Jul 21, 2017
4 changes: 3 additions & 1 deletion dynet/CMakeLists.txt
@@ -39,6 +39,7 @@ set(dynet_library_SRCS
nodes-linalg.cc
nodes-logsumexp.cc
nodes-losses.cc
nodes-lstm.cc
nodes-matrixmultiply.cc
nodes-maxpooling2d.cc
nodes-minmax.cc
@@ -72,7 +73,6 @@ set(dynet_library_HDRS
aligned-mem-pool.h
c2w.h
cfsm-builder.h
cuda-matrix-multiply.h
cuda.h
cudnn-ops.h
deep-lstm.h
@@ -96,6 +96,7 @@ set(dynet_library_HDRS
init.h
io.h
lstm.h
matrix-multiply.h
mem.h
model.h
nodes-contract.h
@@ -142,6 +143,7 @@ set(dynet_gpu_mergeable_SRCS
nodes-linalg
nodes-logsumexp
nodes-losses
nodes-lstm
nodes-matrixmultiply
nodes-maxpooling2d
nodes-minmax
43 changes: 0 additions & 43 deletions dynet/cuda-matrix-multiply.h

This file was deleted.

13 changes: 13 additions & 0 deletions dynet/expr.cc
@@ -197,4 +197,17 @@ Expression layer_norm(const Expression& x, const Expression& g, const Expression

Expression weight_norm(const Expression& w, const Expression& g){return Expression(w.pg, w.pg->add_function<WeightNormalization>({w.i,g.i}));}

Expression vanilla_lstm_gates(const Expression& x_t, const Expression& h_tm1, const Expression& Wx, const Expression& Wh, const Expression& b, real weightnoise_std){
return Expression(x_t.pg, x_t.pg->add_function<VanillaLSTMGates>({x_t.i, h_tm1.i, Wx.i, Wh.i, b.i}, weightnoise_std));
}
Expression vanilla_lstm_gates(const Expression& x_t, const Expression& h_tm1, const Expression& Wx, const Expression& Wh, const Expression& b, const Expression& dropout_mask_x, const Expression& dropout_mask_h, real weightnoise_std){
return Expression(x_t.pg, x_t.pg->add_function<VanillaLSTMGates>({x_t.i, h_tm1.i, Wx.i, Wh.i, b.i, dropout_mask_x.i, dropout_mask_h.i}, weightnoise_std));
}
Expression vanilla_lstm_c(const Expression& c_tm1, const Expression& gates_t){
return Expression(c_tm1.pg, c_tm1.pg->add_function<VanillaLSTMC>({c_tm1.i, gates_t.i}));
}
Expression vanilla_lstm_h(const Expression& c_t, const Expression& gates_t){
return Expression(c_t.pg, c_t.pg->add_function<VanillaLSTMH>({c_t.i, gates_t.i}));
}

} // namespace dynet
66 changes: 66 additions & 0 deletions dynet/expr.h
@@ -2091,6 +2091,72 @@ Expression layer_norm(const Expression& x, const Expression& g, const Expression
*/
Expression weight_norm(const Expression& w, const Expression& g);

/**
* \ingroup lstm
* \brief Computes LSTM matrix multiplies plus nonlinearities
* \details Computes LSTM gates (matrix multiply + nonlinearities) as follows:
*
* gates_i = sigmoid (Wx_i * x_t + Wh_i * h_tm1 + b_i)
* gates_f = sigmoid (Wx_f * x_t + Wh_f * h_tm1 + b_f + 1)
* gates_o = sigmoid (Wx_o * x_t + Wh_o * h_tm1 + b_o)
* gates_g = tanh (Wx_g * x_t + Wh_g * h_tm1 + b_g)
*
* Optionally, Gaussian noise with the given standard deviation is applied to the Wx, Wh, and b parameters.
*
* returns [gates_i]
* [gates_f]
* [gates_o]
* [gates_g]
*
*
* \param x_t Input at current timestep (vector size I)
* \param h_tm1 Hidden state of the previous timestep (vector size H)
* \param Wx Input-to-hidden parameter matrix (size 4H x I)
* \param Wh Hidden-to-hidden parameter matrix (size 4H x H)
* \param b Bias parameter (size 4H)
* \param weightnoise_std Standard deviation of the Gaussian noise applied to the weights (Wx, Wh, b); requires only temporary additional memory
* \return An expression of size 4H containing the concatenated gate activations
*/
Expression vanilla_lstm_gates(const Expression& x_t, const Expression& h_tm1, const Expression& Wx, const Expression& Wh, const Expression& b, real weightnoise_std=0.f);

/**
* \ingroup lstm
* \brief Computes LSTM matrix multiplies plus nonlinearities, while applying a dropout mask to input and previous state
* \param x_t Input at current timestep (vector size I)
* \param h_tm1 Hidden state of the previous timestep (vector size H)
* \param Wx Input-to-hidden parameter matrix (size 4H x I)
* \param Wh Hidden-to-hidden parameter matrix (size 4H x H)
* \param b Bias parameter (size 4H)
* \param dropout_mask_x Input dropout mask (size I)
* \param dropout_mask_h Hidden-state dropout mask (size H)
* \param weightnoise_std Standard deviation of the Gaussian noise applied to the weights (Wx, Wh, b); requires only temporary additional memory
* \return An expression of size 4H containing the concatenated gate activations
*/
Expression vanilla_lstm_gates(const Expression& x_t, const Expression& h_tm1, const Expression& Wx, const Expression& Wh, const Expression& b, const Expression& dropout_mask_x, const Expression& dropout_mask_h, real weightnoise_std=0.f);
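
For illustration only (not part of this diff), a minimal sketch of driving the mask-taking overload; the sizes, drop rate, and variable names are made-up examples. The masks are built with random_bernoulli using inverted-dropout scaling, the same way CompactVanillaLSTMBuilder::set_dropout_masks does in dynet/lstm.cc below:

#include "dynet/dynet.h"
#include "dynet/expr.h"
#include <vector>
using namespace dynet;

int main(int argc, char** argv) {
  initialize(argc, argv);
  const unsigned I = 8, H = 16;   // example input / hidden sizes
  const float p = 0.3f;           // example drop probability
  ParameterCollection model;
  Parameter p_Wx = model.add_parameters({4 * H, I});
  Parameter p_Wh = model.add_parameters({4 * H, H});
  Parameter p_b  = model.add_parameters({4 * H});

  ComputationGraph cg;
  Expression Wx = parameter(cg, p_Wx), Wh = parameter(cg, p_Wh), b = parameter(cg, p_b);
  // Inverted dropout: keep a unit with probability 1-p and scale survivors by 1/(1-p);
  // the masks are sampled once and reused at every timestep of the sequence.
  Expression mask_x = random_bernoulli(cg, Dim({I}), 1.f - p, 1.f / (1.f - p));
  Expression mask_h = random_bernoulli(cg, Dim({H}), 1.f - p, 1.f / (1.f - p));

  std::vector<float> x_vals(I, 0.5f);               // dummy input values
  Expression x_t   = input(cg, Dim({I}), x_vals);
  Expression h_tm1 = zeroes(cg, Dim({H}));
  Expression gates = vanilla_lstm_gates(x_t, h_tm1, Wx, Wh, b, mask_x, mask_h);
  cg.forward(gates);                                // 4H masked gate activations
  return 0;
}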

/**
* \ingroup lstm
* \brief Computes LSTM cell state
* \details Computes LSTM cell: c_t = gates_i . gates_g + gates_f . c_tm1 (elementwise products)
*
* \param c_tm1 Cell at previous timestep (vector size H)
* \param gates_t Gates at current timestep as computed by vanilla_lstm_gates (vector size 4H)
* \return Vector size H
*/
Expression vanilla_lstm_c(const Expression& c_tm1, const Expression& gates_t);

/**
* \ingroup lstm
* \brief Computes LSTM hidden state
* \details Computes LSTM output: h_t = gates_o . tanh(c_t) (elementwise product)
*
* \param c_t Cell at current timestep (vector size H)
* \param gates_t Gates at current timestep as computed by vanilla_lstm_gates (vector size 4H)
* \return Vector size H
*/

Expression vanilla_lstm_h(const Expression& c_t, const Expression& gates_t);
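
To show how the three nodes compose into a single recurrence step, here is a minimal, self-contained sketch; it is not part of the diff, and the dimensions, input values, and variable names are illustrative assumptions only:

#include "dynet/dynet.h"
#include "dynet/expr.h"
#include <vector>
using namespace dynet;

int main(int argc, char** argv) {
  initialize(argc, argv);
  const unsigned I = 8, H = 16;                       // example input / hidden sizes
  ParameterCollection model;
  Parameter p_Wx = model.add_parameters({4 * H, I});  // packed i,f,o,g input weights
  Parameter p_Wh = model.add_parameters({4 * H, H});  // packed i,f,o,g recurrent weights
  Parameter p_b  = model.add_parameters({4 * H});     // packed biases

  ComputationGraph cg;
  Expression Wx = parameter(cg, p_Wx);
  Expression Wh = parameter(cg, p_Wh);
  Expression b  = parameter(cg, p_b);
  Expression h = zeroes(cg, Dim({H}));                // h_0
  Expression c = zeroes(cg, Dim({H}));                // c_0

  std::vector<float> x_vals(I, 0.5f);                 // dummy input values
  for (unsigned t = 0; t < 5; ++t) {
    Expression x_t = input(cg, Dim({I}), x_vals);
    Expression gates = vanilla_lstm_gates(x_t, h, Wx, Wh, b);  // 4H gate activations
    c = vanilla_lstm_c(c, gates);                     // c_t = i . g + f . c_{t-1}
    h = vanilla_lstm_h(c, gates);                     // h_t = o . tanh(c_t)
  }
  std::vector<float> h_T = as_vector(cg.forward(h));  // execute the graph
  return 0;
}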

} // namespace dynet

#endif
197 changes: 197 additions & 0 deletions dynet/lstm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -529,4 +529,201 @@ void VanillaLSTMBuilder::disable_dropout() {
dropout_rate_h = 0.f;
}


CompactVanillaLSTMBuilder::CompactVanillaLSTMBuilder() : has_initial_state(false), layers(0), input_dim(0), hid(0), dropout_rate_h(0), weightnoise_std(0) { }

CompactVanillaLSTMBuilder::CompactVanillaLSTMBuilder(unsigned layers,
unsigned input_dim,
unsigned hidden_dim,
ParameterCollection& model)
: layers(layers), input_dim(input_dim), hid(hidden_dim), weightnoise_std(0){
unsigned layer_input_dim = input_dim;
local_model = model.add_subcollection("compact-vanilla-lstm-builder");
for (unsigned i = 0; i < layers; ++i) {
// input-to-hidden weights, hidden-to-hidden weights, and bias for gates i,f,o,g (packed as 4H rows)
Parameter p_Wx = local_model.add_parameters({hidden_dim * 4, layer_input_dim});
Parameter p_Wh = local_model.add_parameters({hidden_dim * 4, hidden_dim});
Parameter p_b = local_model.add_parameters({hidden_dim * 4}, ParameterInitConst(0.f));

layer_input_dim = hidden_dim; // output (hidden) from 1st layer is input to next

vector<Parameter> ps = {p_Wx, p_Wh, p_b};
params.push_back(ps);

} // layers
dropout_rate = 0.f;
dropout_rate_h = 0.f;
}

void CompactVanillaLSTMBuilder::new_graph_impl(ComputationGraph& cg, bool update) {
param_vars.clear();
for (unsigned i = 0; i < layers; ++i) {
auto& p = params[i];
vector<Expression> vars;
for (unsigned j = 0; j < p.size(); ++j) { vars.push_back(update ? parameter(cg, p[j]) : const_parameter(cg, p[j])); }
param_vars.push_back(vars);
}

_cg = &cg;
}
// layout of hinit: entries 0..layers-1 hold c, entries layers..2*layers-1 hold h
void CompactVanillaLSTMBuilder::start_new_sequence_impl(const vector<Expression>& hinit) {
h.clear();
c.clear();

if (hinit.size() > 0) {
DYNET_ARG_CHECK(layers * 2 == hinit.size(),
"CompactVanillaLSTMBuilder must be initialized with 2 times as many expressions as layers "
"(hidden state, and cell for each layer). However, for " << layers << " layers, " <<
hinit.size() << " expressions were passed in");
h0.resize(layers);
c0.resize(layers);
for (unsigned i = 0; i < layers; ++i) {
c0[i] = hinit[i];
h0[i] = hinit[i + layers];
}
has_initial_state = true;
} else {
has_initial_state = false;
}

// Initialize dropout masks
set_dropout_masks();
}

void CompactVanillaLSTMBuilder::set_dropout_masks(unsigned batch_size) {
masks.clear();
for (unsigned i = 0; i < layers; ++i) {
std::vector<Expression> masks_i;
unsigned idim = (i == 0) ? input_dim : hid;
if (dropout_rate > 0.f || dropout_rate_h > 0.f) {
float retention_rate = 1.f - dropout_rate;
float retention_rate_h = 1.f - dropout_rate_h;
float scale = 1.f / retention_rate;
float scale_h = 1.f / retention_rate_h;
// in
masks_i.push_back(random_bernoulli(*_cg, Dim({ idim}, batch_size), retention_rate, scale));
// h
masks_i.push_back(random_bernoulli(*_cg, Dim({ hid}, batch_size), retention_rate_h, scale_h));
masks.push_back(masks_i);
}
}
}
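
For reference, these masks implement the tied (per-sequence) dropout of arXiv:1512.05287 cited in add_input_impl below; a sketch of the math, where p_x and p_h are the input and recurrent drop rates:

\tilde{x}_t = m_x \odot x_t, \qquad \tilde{h}_{t-1} = m_h \odot h_{t-1}, \qquad
m_{x,j} \sim \tfrac{1}{1-p_x}\,\mathrm{Bernoulli}(1-p_x), \qquad
m_{h,j} \sim \tfrac{1}{1-p_h}\,\mathrm{Bernoulli}(1-p_h)

The same masks m_x and m_h are reused at every timestep of a sequence, and the 1/(1-p) scaling keeps the expected value of each masked unit equal to its unmasked value.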

ParameterCollection & CompactVanillaLSTMBuilder::get_parameter_collection() {
return local_model;
}

// TODO: make this correct
// Copies c from the previous step (otherwise c.size() < h.size())
// Also, is creating a new step something we want?
// Wouldn't overwriting the current one be better?
Expression CompactVanillaLSTMBuilder::set_h_impl(int prev, const vector<Expression>& h_new) {
DYNET_ARG_CHECK(h_new.empty() || h_new.size() == layers,
"CompactVanillaLSTMBuilder::set_h expects as many inputs as layers, but got " <<
h_new.size() << " inputs for " << layers << " layers");
const unsigned t = h.size();
h.push_back(vector<Expression>(layers));
c.push_back(vector<Expression>(layers));
for (unsigned i = 0; i < layers; ++i) {
Expression h_i = h_new[i];
Expression c_i = c[t - 1][i];
h[t][i] = h_i;
c[t][i] = c_i;
}
return h[t].back();
}
// Current implementation : s_new is either {new_c[0],...,new_c[n]}
// or {new_c[0],...,new_c[n],new_h[0],...,new_h[n]}
Expression CompactVanillaLSTMBuilder::set_s_impl(int prev, const std::vector<Expression>& s_new) {
DYNET_ARG_CHECK(s_new.size() == layers || s_new.size() == 2 * layers,
"CompactVanillaLSTMBuilder::set_s expects either as many inputs or twice as many inputs as layers, but got " << s_new.size() << " inputs for " << layers << " layers");
bool only_c = s_new.size() == layers;
const unsigned t = c.size();
h.push_back(vector<Expression>(layers));
c.push_back(vector<Expression>(layers));
for (unsigned i = 0; i < layers; ++i) {
Expression h_i = only_c ? h[t - 1][i] : s_new[i + layers];
Expression c_i = s_new[i];
h[t][i] = h_i;
c[t][i] = c_i;
}
return h[t].back();
}

Expression CompactVanillaLSTMBuilder::add_input_impl(int prev, const Expression& x) {
h.push_back(vector<Expression>(layers));
c.push_back(vector<Expression>(layers));
vector<Expression>& ht = h.back();
vector<Expression>& ct = c.back();
Expression in = x;
for (unsigned i = 0; i < layers; ++i) {
const vector<Expression>& vars = param_vars[i];
Expression i_h_tm1, i_c_tm1;
if (prev < 0) {
if (has_initial_state) {
// initial value for h and c at timestep 0 in layer i
// defaults to zero matrix input if not set in add_parameter_edges
i_h_tm1 = h0[i];
i_c_tm1 = c0[i];
} else {
i_h_tm1 = zeroes(*_cg, Dim({vars[_BI].dim()[0]/4}, x.dim().bd));
i_c_tm1 = i_h_tm1;
}
} else { // t > 0
i_h_tm1 = h[prev][i];
i_c_tm1 = c[prev][i];
}
// TODO: could extend the lstm nodes to take several inputs that are concatenated internally; this would save memory by avoiding the concatenate() operation for bidirectional LSTMs
// TODO: smaller speed / memory gains are possible with a version of the lstm gates that assumes the c or h inputs are zero (at the beginning of a sequence)
if (dropout_rate_h > 0.f){
// apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights)
Expression gates_t = vanilla_lstm_gates(in, i_h_tm1, vars[_X2I], vars[_H2I], vars[_BI], masks[i][0], masks[i][1], weightnoise_std);
ct[i] = vanilla_lstm_c(i_c_tm1, gates_t);
in = ht[i] = vanilla_lstm_h(ct[i], gates_t);
} else {
Expression gates_t = vanilla_lstm_gates(in, i_h_tm1, vars[_X2I], vars[_H2I], vars[_BI], weightnoise_std);
ct[i] = vanilla_lstm_c(i_c_tm1, gates_t);
in = ht[i] = vanilla_lstm_h(ct[i], gates_t);
}
}
return ht.back();
}
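
As context for how these pieces are typically driven, a minimal usage sketch; it is not part of the diff, and the layer count, dimensions, dropout rates, and input values are illustrative assumptions:

#include "dynet/dynet.h"
#include "dynet/expr.h"
#include "dynet/lstm.h"
#include <vector>
using namespace dynet;

int main(int argc, char** argv) {
  initialize(argc, argv);
  const unsigned LAYERS = 2, I = 8, H = 16;     // example sizes
  ParameterCollection model;
  CompactVanillaLSTMBuilder lstm(LAYERS, I, H, model);
  lstm.set_dropout(0.3f, 0.3f);                 // input and recurrent dropout rates
  lstm.set_weightnoise(0.01f);                  // Gaussian noise on Wx, Wh, b

  ComputationGraph cg;
  lstm.new_graph(cg);
  lstm.start_new_sequence();                    // also samples the per-sequence dropout masks
  std::vector<float> x_vals(I, 0.5f);           // dummy input values
  Expression h_t;
  for (unsigned t = 0; t < 5; ++t)
    h_t = lstm.add_input(input(cg, Dim({I}), x_vals));
  std::vector<float> h_T = as_vector(cg.forward(h_t));  // top-layer hidden state at the last step
  return 0;
}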

void CompactVanillaLSTMBuilder::copy(const RNNBuilder & rnn) {
const CompactVanillaLSTMBuilder & rnn_lstm = (const CompactVanillaLSTMBuilder&)rnn;
DYNET_ARG_CHECK(params.size() == rnn_lstm.params.size(),
"Attempt to copy CompactVanillaLSTMBuilder with different number of parameters "
"(" << params.size() << " != " << rnn_lstm.params.size() << ")");
for (size_t i = 0; i < params.size(); ++i)
for (size_t j = 0; j < params[i].size(); ++j)
params[i][j] = rnn_lstm.params[i][j];
}

void CompactVanillaLSTMBuilder::set_dropout(float d) {
DYNET_ARG_CHECK(d >= 0.f && d <= 1.f,
"dropout rate must be a probability (>=0 and <=1)");
dropout_rate = d;
dropout_rate_h = d;
}

void CompactVanillaLSTMBuilder::set_dropout(float d, float d_h) {
DYNET_ARG_CHECK(d >= 0.f && d <= 1.f && d_h >= 0.f && d_h <= 1.f,
"dropout rate must be a probability (>=0 and <=1)");
dropout_rate = d;
dropout_rate_h = d_h;
}

void CompactVanillaLSTMBuilder::disable_dropout() {
dropout_rate = 0.f;
dropout_rate_h = 0.f;
}
void CompactVanillaLSTMBuilder::set_weightnoise(float std) {
DYNET_ARG_CHECK(std >= 0.f, "weight noise must have standard deviation >=0");
weightnoise_std = std;
}



} // namespace dynet