LSTM node #729

Status: Merged (45 commits, Jul 24, 2017)

Commits
4e8ceb2
first stab at lstm node, forward pass
msperber Jul 10, 2017
222b24e
added comment
msperber Jul 10, 2017
c4c654e
LSTM forward: memory layout of h/c vector + restructuring/fixes
msperber Jul 11, 2017
7b6d413
bugfix + prepared unit test
msperber Jul 12, 2017
e485905
summarized 4 matrix multiplication into one operation; forward comput…
msperber Jul 12, 2017
80e2d8e
forward tests passing
msperber Jul 12, 2017
fc55106
lstm node forward pass working with minibatches
msperber Jul 12, 2017
48732f3
lstm 3-part node: forward working & tested
msperber Jul 13, 2017
b368062
backward pass for vanilla_lstm_h and vanilla_lstm_c
msperber Jul 13, 2017
10c6a46
lstm-gates: parts of backward pass
msperber Jul 14, 2017
9efa45d
some more missing parts
msperber Jul 14, 2017
0cc6379
removed old code, cleaned up tests
msperber Jul 14, 2017
115211a
vanilla_lstm_c: backward passing test
msperber Jul 17, 2017
f03db7f
some testing / fixing
msperber Jul 18, 2017
9b1f2b4
some comments
msperber Jul 18, 2017
ab1ea1a
Merge branch 'master' into lstm-node
msperber Jul 18, 2017
fed3f30
CPU matrix multiply
msperber Jul 18, 2017
9688329
re-added LSTM node
msperber Jul 18, 2017
b217ce0
lstm gates fwd passing test
msperber Jul 18, 2017
a09b713
tests
msperber Jul 18, 2017
6ccefe4
lstm_h backward passing gradient checks
msperber Jul 19, 2017
59235b2
lstm gates backward: replaced contraction by matrix multiply
msperber Jul 19, 2017
f5ab4bf
lstm gates bwd implemented but not yet passing tests
msperber Jul 19, 2017
49fabe6
fixed math in lstm_h backward
msperber Jul 19, 2017
eefbe47
fixed the same math error in lstm_gates bwd
msperber Jul 19, 2017
f050747
improved tests; all tests passing
msperber Jul 19, 2017
2d8c184
unified CPUMatrixMultiply / CUDAMatrixMultiply -> MatrixMultiply
msperber Jul 19, 2017
5582778
python interface for LSTM node
msperber Jul 20, 2017
7a915a8
changed use of slice for potential speed improvement
msperber Jul 20, 2017
c4f2ac3
speed-up: replace .sum() by manual summation over batches
msperber Jul 20, 2017
6b37e7b
speed-up by replacing shuffle with reshape
msperber Jul 20, 2017
2093f77
speed by replacing shuffle with transpose
msperber Jul 20, 2017
e93b6d4
marked places that need speed-up
msperber Jul 20, 2017
6e516cc
speed up for outer product
msperber Jul 20, 2017
e6b90c5
Some fixes for GPU
neubig Jul 20, 2017
3e54f0b
Comment out extra scratch memory
neubig Jul 21, 2017
3d1bfe3
Merge pull request #1 from neubig/lstm-node
msperber Jul 21, 2017
182fb08
add CompactVanillaLSTMBuilder
msperber Jul 21, 2017
cc3739a
added weight norm to vanilla_lstm_gates
msperber Jul 21, 2017
fbd632a
updated doc & python interface
msperber Jul 21, 2017
1132bac
add weight noise to CompactVanillaLSTMBuilder
msperber Jul 21, 2017
6cd6ccc
added missing free() of scratch allocator
msperber Jul 21, 2017
e485713
initial code to integrate dropout into lstm nodes
msperber Jul 21, 2017
14d0af5
integrated dropout into LSTM node
msperber Jul 21, 2017
8648640
removed unused variables
msperber Jul 21, 2017
4 changes: 3 additions & 1 deletion dynet/CMakeLists.txt
@@ -39,6 +39,7 @@ set(dynet_library_SRCS
nodes-linalg.cc
nodes-logsumexp.cc
nodes-losses.cc
nodes-lstm.cc
nodes-matrixmultiply.cc
nodes-maxpooling2d.cc
nodes-minmax.cc
@@ -72,7 +73,6 @@ set(dynet_library_HDRS
aligned-mem-pool.h
c2w.h
cfsm-builder.h
cuda-matrix-multiply.h
cuda.h
cudnn-ops.h
deep-lstm.h
@@ -96,6 +96,7 @@ set(dynet_library_HDRS
init.h
io.h
lstm.h
matrix-multiply.h
mem.h
model.h
nodes-contract.h
@@ -142,6 +143,7 @@ set(dynet_gpu_mergeable_SRCS
nodes-linalg
nodes-logsumexp
nodes-losses
nodes-lstm
nodes-matrixmultiply
nodes-maxpooling2d
nodes-minmax
43 changes: 0 additions & 43 deletions dynet/cuda-matrix-multiply.h

This file was deleted.

13 changes: 13 additions & 0 deletions dynet/expr.cc
@@ -197,4 +197,17 @@ Expression layer_norm(const Expression& x, const Expression& g, const Expression

Expression weight_norm(const Expression& w, const Expression& g){return Expression(w.pg, w.pg->add_function<WeightNormalization>({w.i,g.i}));}

Expression vanilla_lstm_gates(const Expression& x_t, const Expression& h_tm1, const Expression& Wx, const Expression& Wh, const Expression& b, real weightnoise_std){
return Expression(x_t.pg, x_t.pg->add_function<VanillaLSTMGates>({x_t.i, h_tm1.i, Wx.i, Wh.i, b.i}, weightnoise_std));
}
Expression vanilla_lstm_gates(const Expression& x_t, const Expression& h_tm1, const Expression& Wx, const Expression& Wh, const Expression& b, const Expression& dropout_mask_x, const Expression& dropout_mask_h, real weightnoise_std){
return Expression(x_t.pg, x_t.pg->add_function<VanillaLSTMGates>({x_t.i, h_tm1.i, Wx.i, Wh.i, b.i, dropout_mask_x.i, dropout_mask_h.i}, weightnoise_std));
}
Expression vanilla_lstm_c(const Expression& c_tm1, const Expression& gates_t){
return Expression(c_tm1.pg, c_tm1.pg->add_function<VanillaLSTMC>({c_tm1.i, gates_t.i}));
}
Expression vanilla_lstm_h(const Expression& c_t, const Expression& gates_t){
return Expression(c_t.pg, c_t.pg->add_function<VanillaLSTMH>({c_t.i, gates_t.i}));
}

} // namespace dynet
66 changes: 66 additions & 0 deletions dynet/expr.h
@@ -2091,6 +2091,72 @@ Expression layer_norm(const Expression& x, const Expression& g, const Expression
*/
Expression weight_norm(const Expression& w, const Expression& g);

/**
* \ingroup lstm
* \brief Computes LSTM matrix multiplies plus nonlinearities
* \details Computes LSTM gates (matrix multiply + nonlinearities) as follows:
*
* gates_i = sigmoid (Wx_i * x_t + Wh_i * h_tm1 + b_i)
* gates_f = sigmoid (Wx_f * x_t + Wh_f * h_tm1 + b_f + 1)
* gates_o = sigmoid (Wx_o * x_t + Wh_o * h_tm1 + b_o)
* gates_g = tanh (Wx_g * x_t + Wh_g * h_tm1 + b_g)
*
* Optionally, Gaussian noise with the given standard deviation is applied to the Wx, Wh, and b parameters.
*
* returns [gates_i]
* [gates_f]
* [gates_o]
* [gates_g]
*
*
* \param x_t Input at current timestep (vector size I)
* \param h_tm1 Hidden state of the previous timestep (vector size H)
* \param Wx Input-to-hidden parameter matrix (size 4H x I)
* \param Wh Hidden-to-hidden parameter matrix (size 4H x H)
* \param b Bias parameter (size 4H)
* \param weightnoise_std Standard deviation of the Gaussian noise applied to the weights (Wx, Wh, b); requires only temporary additional memory
* \return An expression of size 4H containing the concatenated gate activations
*/
Expression vanilla_lstm_gates(const Expression& x_t, const Expression& h_tm1, const Expression& Wx, const Expression& Wh, const Expression& b, real weightnoise_std=0.f);

/**
* \ingroup lstm
* \brief Computes LSTM matrix multiplies plus nonlinearities, while applying a dropout mask to input and previous state
* \param x_t Input at current timestep (vector size I)
* \param h_tm1 Hidden state of the previous timestep (vector size H)
* \param Wx Input-to-hidden parameter matrix (size 4H x I)
* \param Wh Hidden-to-hidden parameter matrix (size 4H x H)
* \param b Bias parameter (size 4H)
* \param dropout_mask_x Input dropout mask (size I)
* \param dropout_mask_h Hidden-state dropout mask (size H)
* \param weightnoise_std Standard deviation of the Gaussian noise applied to the weights (Wx, Wh, b); requires only temporary additional memory
* \return An expression of size 4H containing the concatenated gate activations
*/
Expression vanilla_lstm_gates(const Expression& x_t, const Expression& h_tm1, const Expression& Wx, const Expression& Wh, const Expression& b, const Expression& dropout_mask_x, const Expression& dropout_mask_h, real weightnoise_std=0.f);
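
For illustration only (not part of this diff), a minimal sketch of driving the mask-taking overload; the sizes, drop rate, and variable names are made-up examples. The masks are built with random_bernoulli using inverted-dropout scaling, the same way CompactVanillaLSTMBuilder::set_dropout_masks does in dynet/lstm.cc below:

#include "dynet/dynet.h"
#include "dynet/expr.h"
#include <vector>
using namespace dynet;

int main(int argc, char** argv) {
  initialize(argc, argv);
  const unsigned I = 8, H = 16;   // example input / hidden sizes
  const float p = 0.3f;           // example drop probability
  ParameterCollection model;
  Parameter p_Wx = model.add_parameters({4 * H, I});
  Parameter p_Wh = model.add_parameters({4 * H, H});
  Parameter p_b  = model.add_parameters({4 * H});

  ComputationGraph cg;
  Expression Wx = parameter(cg, p_Wx), Wh = parameter(cg, p_Wh), b = parameter(cg, p_b);
  // Inverted dropout: keep a unit with probability 1-p and scale survivors by 1/(1-p);
  // the masks are sampled once and reused at every timestep of the sequence.
  Expression mask_x = random_bernoulli(cg, Dim({I}), 1.f - p, 1.f / (1.f - p));
  Expression mask_h = random_bernoulli(cg, Dim({H}), 1.f - p, 1.f / (1.f - p));

  std::vector<float> x_vals(I, 0.5f);               // dummy input values
  Expression x_t   = input(cg, Dim({I}), x_vals);
  Expression h_tm1 = zeroes(cg, Dim({H}));
  Expression gates = vanilla_lstm_gates(x_t, h_tm1, Wx, Wh, b, mask_x, mask_h);
  cg.forward(gates);                                // 4H masked gate activations
  return 0;
}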

/**
* \ingroup lstm
* \brief Computes LSTM cell state
* \details Computes LSTM cell: c_t = gates_i . gates_g + gates_f . c_tm1 (elementwise products)
*
* \param c_tm1 Cell at previous timestep (vector size H)
* \param gates_t Gates at current timestep as computed by vanilla_lstm_gates (vector size 4H)
* \return Vector size H
*/
Expression vanilla_lstm_c(const Expression& c_tm1, const Expression& gates_t);

/**
* \ingroup lstm
* \brief Computes LSTM hidden state
* \details Computes LSTM output: h_t = gates_o . tanh(c_t) (elementwise product)
*
* \param c_t Cell at current timestep (vector size H)
* \param gates_t Gates at current timestep as computed by vanilla_lstm_gates (vector size 4H)
* \return Vector size H
*/

Expression vanilla_lstm_h(const Expression& c_t, const Expression& gates_t);
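
To show how the three nodes compose into a single recurrence step, here is a minimal, self-contained sketch; it is not part of the diff, and the dimensions, input values, and variable names are illustrative assumptions only:

#include "dynet/dynet.h"
#include "dynet/expr.h"
#include <vector>
using namespace dynet;

int main(int argc, char** argv) {
  initialize(argc, argv);
  const unsigned I = 8, H = 16;                       // example input / hidden sizes
  ParameterCollection model;
  Parameter p_Wx = model.add_parameters({4 * H, I});  // packed i,f,o,g input weights
  Parameter p_Wh = model.add_parameters({4 * H, H});  // packed i,f,o,g recurrent weights
  Parameter p_b  = model.add_parameters({4 * H});     // packed biases

  ComputationGraph cg;
  Expression Wx = parameter(cg, p_Wx);
  Expression Wh = parameter(cg, p_Wh);
  Expression b  = parameter(cg, p_b);
  Expression h = zeroes(cg, Dim({H}));                // h_0
  Expression c = zeroes(cg, Dim({H}));                // c_0

  std::vector<float> x_vals(I, 0.5f);                 // dummy input values
  for (unsigned t = 0; t < 5; ++t) {
    Expression x_t = input(cg, Dim({I}), x_vals);
    Expression gates = vanilla_lstm_gates(x_t, h, Wx, Wh, b);  // 4H gate activations
    c = vanilla_lstm_c(c, gates);                     // c_t = i . g + f . c_{t-1}
    h = vanilla_lstm_h(c, gates);                     // h_t = o . tanh(c_t)
  }
  std::vector<float> h_T = as_vector(cg.forward(h));  // execute the graph
  return 0;
}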

} // namespace dynet

#endif
197 changes: 197 additions & 0 deletions dynet/lstm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -529,4 +529,201 @@ void VanillaLSTMBuilder::disable_dropout() {
dropout_rate_h = 0.f;
}


CompactVanillaLSTMBuilder::CompactVanillaLSTMBuilder() : has_initial_state(false), layers(0), input_dim(0), hid(0), dropout_rate_h(0), weightnoise_std(0) { }

CompactVanillaLSTMBuilder::CompactVanillaLSTMBuilder(unsigned layers,
unsigned input_dim,
unsigned hidden_dim,
ParameterCollection& model)
: layers(layers), input_dim(input_dim), hid(hidden_dim), weightnoise_std(0){
unsigned layer_input_dim = input_dim;
local_model = model.add_subcollection("compact-vanilla-lstm-builder");
for (unsigned i = 0; i < layers; ++i) {
// input-to-hidden weights, hidden-to-hidden weights, and bias for gates i,f,o,g (packed as 4H rows)
Parameter p_Wx = local_model.add_parameters({hidden_dim * 4, layer_input_dim});
Parameter p_Wh = local_model.add_parameters({hidden_dim * 4, hidden_dim});
Parameter p_b = local_model.add_parameters({hidden_dim * 4}, ParameterInitConst(0.f));

layer_input_dim = hidden_dim; // output (hidden) from 1st layer is input to next

vector<Parameter> ps = {p_Wx, p_Wh, p_b};
params.push_back(ps);

} // layers
dropout_rate = 0.f;
dropout_rate_h = 0.f;
}

void CompactVanillaLSTMBuilder::new_graph_impl(ComputationGraph& cg, bool update) {
param_vars.clear();
for (unsigned i = 0; i < layers; ++i) {
auto& p = params[i];
vector<Expression> vars;
for (unsigned j = 0; j < p.size(); ++j) { vars.push_back(update ? parameter(cg, p[j]) : const_parameter(cg, p[j])); }
param_vars.push_back(vars);
}

_cg = &cg;
}
// layout of hinit: entries 0..layers-1 hold c, entries layers..2*layers-1 hold h
void CompactVanillaLSTMBuilder::start_new_sequence_impl(const vector<Expression>& hinit) {
h.clear();
c.clear();

if (hinit.size() > 0) {
DYNET_ARG_CHECK(layers * 2 == hinit.size(),
"CompactVanillaLSTMBuilder must be initialized with 2 times as many expressions as layers "
"(hidden state, and cell for each layer). However, for " << layers << " layers, " <<
hinit.size() << " expressions were passed in");
h0.resize(layers);
c0.resize(layers);
for (unsigned i = 0; i < layers; ++i) {
c0[i] = hinit[i];
h0[i] = hinit[i + layers];
}
has_initial_state = true;
} else {
has_initial_state = false;
}

// Initialize dropout masks
set_dropout_masks();
}

void CompactVanillaLSTMBuilder::set_dropout_masks(unsigned batch_size) {
masks.clear();
for (unsigned i = 0; i < layers; ++i) {
std::vector<Expression> masks_i;
unsigned idim = (i == 0) ? input_dim : hid;
if (dropout_rate > 0.f || dropout_rate_h > 0.f) {
float retention_rate = 1.f - dropout_rate;
float retention_rate_h = 1.f - dropout_rate_h;
float scale = 1.f / retention_rate;
float scale_h = 1.f / retention_rate_h;
// in
masks_i.push_back(random_bernoulli(*_cg, Dim({ idim}, batch_size), retention_rate, scale));
// h
masks_i.push_back(random_bernoulli(*_cg, Dim({ hid}, batch_size), retention_rate_h, scale_h));
masks.push_back(masks_i);
}
}
}
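
For reference, these masks implement the tied (per-sequence) dropout of arXiv:1512.05287 cited in add_input_impl below; a sketch of the math, where p_x and p_h are the input and recurrent drop rates:

\tilde{x}_t = m_x \odot x_t, \qquad \tilde{h}_{t-1} = m_h \odot h_{t-1}, \qquad
m_{x,j} \sim \tfrac{1}{1-p_x}\,\mathrm{Bernoulli}(1-p_x), \qquad
m_{h,j} \sim \tfrac{1}{1-p_h}\,\mathrm{Bernoulli}(1-p_h)

The same masks m_x and m_h are reused at every timestep of a sequence, and the 1/(1-p) scaling keeps the expected value of each masked unit equal to its unmasked value.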

ParameterCollection & CompactVanillaLSTMBuilder::get_parameter_collection() {
return local_model;
}

// TODO: make this correct
// Copies c from the previous step (otherwise c.size() < h.size())
// Also, is creating a new step something we want?
// Wouldn't overwriting the current one be better?
Expression CompactVanillaLSTMBuilder::set_h_impl(int prev, const vector<Expression>& h_new) {
DYNET_ARG_CHECK(h_new.empty() || h_new.size() == layers,
"CompactVanillaLSTMBuilder::set_h expects as many inputs as layers, but got " <<
h_new.size() << " inputs for " << layers << " layers");
const unsigned t = h.size();
h.push_back(vector<Expression>(layers));
c.push_back(vector<Expression>(layers));
for (unsigned i = 0; i < layers; ++i) {
Expression h_i = h_new[i];
Expression c_i = c[t - 1][i];
h[t][i] = h_i;
c[t][i] = c_i;
}
return h[t].back();
}
// Current implementation : s_new is either {new_c[0],...,new_c[n]}
// or {new_c[0],...,new_c[n],new_h[0],...,new_h[n]}
Expression CompactVanillaLSTMBuilder::set_s_impl(int prev, const std::vector<Expression>& s_new) {
DYNET_ARG_CHECK(s_new.size() == layers || s_new.size() == 2 * layers,
"CompactVanillaLSTMBuilder::set_s expects either as many inputs or twice as many inputs as layers, but got " << s_new.size() << " inputs for " << layers << " layers");
bool only_c = s_new.size() == layers;
const unsigned t = c.size();
h.push_back(vector<Expression>(layers));
c.push_back(vector<Expression>(layers));
for (unsigned i = 0; i < layers; ++i) {
Expression h_i = only_c ? h[t - 1][i] : s_new[i + layers];
Expression c_i = s_new[i];
h[t][i] = h_i;
c[t][i] = c_i;
}
return h[t].back();
}

Expression CompactVanillaLSTMBuilder::add_input_impl(int prev, const Expression& x) {
h.push_back(vector<Expression>(layers));
c.push_back(vector<Expression>(layers));
vector<Expression>& ht = h.back();
vector<Expression>& ct = c.back();
Expression in = x;
for (unsigned i = 0; i < layers; ++i) {
const vector<Expression>& vars = param_vars[i];
Expression i_h_tm1, i_c_tm1;
if (prev < 0) {
if (has_initial_state) {
// initial value for h and c at timestep 0 in layer i
// defaults to zero matrix input if not set in add_parameter_edges
i_h_tm1 = h0[i];
i_c_tm1 = c0[i];
} else {
i_h_tm1 = zeroes(*_cg, Dim({vars[_BI].dim()[0]/4}, x.dim().bd));
i_c_tm1 = i_h_tm1;
}
} else { // t > 0
i_h_tm1 = h[prev][i];
i_c_tm1 = c[prev][i];
}
// TODO: could extend the lstm nodes to take several inputs that are concatenated internally; this would save memory by avoiding the concatenate() operation for bidirectional LSTMs
// TODO: smaller speed / memory gains are possible with a version of the lstm gates that assumes the c or h inputs are zero (at the beginning of a sequence)
if (dropout_rate_h > 0.f){
// apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights)
Expression gates_t = vanilla_lstm_gates(in, i_h_tm1, vars[_X2I], vars[_H2I], vars[_BI], masks[i][0], masks[i][1], weightnoise_std);
ct[i] = vanilla_lstm_c(i_c_tm1, gates_t);
in = ht[i] = vanilla_lstm_h(ct[i], gates_t);
} else {
Expression gates_t = vanilla_lstm_gates(in, i_h_tm1, vars[_X2I], vars[_H2I], vars[_BI], weightnoise_std);
ct[i] = vanilla_lstm_c(i_c_tm1, gates_t);
in = ht[i] = vanilla_lstm_h(ct[i], gates_t);
}
}
return ht.back();
}
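
As context for how these pieces are typically driven, a minimal usage sketch; it is not part of the diff, and the layer count, dimensions, dropout rates, and input values are illustrative assumptions:

#include "dynet/dynet.h"
#include "dynet/expr.h"
#include "dynet/lstm.h"
#include <vector>
using namespace dynet;

int main(int argc, char** argv) {
  initialize(argc, argv);
  const unsigned LAYERS = 2, I = 8, H = 16;     // example sizes
  ParameterCollection model;
  CompactVanillaLSTMBuilder lstm(LAYERS, I, H, model);
  lstm.set_dropout(0.3f, 0.3f);                 // input and recurrent dropout rates
  lstm.set_weightnoise(0.01f);                  // Gaussian noise on Wx, Wh, b

  ComputationGraph cg;
  lstm.new_graph(cg);
  lstm.start_new_sequence();                    // also samples the per-sequence dropout masks
  std::vector<float> x_vals(I, 0.5f);           // dummy input values
  Expression h_t;
  for (unsigned t = 0; t < 5; ++t)
    h_t = lstm.add_input(input(cg, Dim({I}), x_vals));
  std::vector<float> h_T = as_vector(cg.forward(h_t));  // top-layer hidden state at the last step
  return 0;
}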

void CompactVanillaLSTMBuilder::copy(const RNNBuilder & rnn) {
const CompactVanillaLSTMBuilder & rnn_lstm = (const CompactVanillaLSTMBuilder&)rnn;
DYNET_ARG_CHECK(params.size() == rnn_lstm.params.size(),
"Attempt to copy CompactVanillaLSTMBuilder with different number of parameters "
"(" << params.size() << " != " << rnn_lstm.params.size() << ")");
for (size_t i = 0; i < params.size(); ++i)
for (size_t j = 0; j < params[i].size(); ++j)
params[i][j] = rnn_lstm.params[i][j];
}

void CompactVanillaLSTMBuilder::set_dropout(float d) {
DYNET_ARG_CHECK(d >= 0.f && d <= 1.f,
"dropout rate must be a probability (>=0 and <=1)");
dropout_rate = d;
dropout_rate_h = d;
}

void CompactVanillaLSTMBuilder::set_dropout(float d, float d_h) {
DYNET_ARG_CHECK(d >= 0.f && d <= 1.f && d_h >= 0.f && d_h <= 1.f,
"dropout rate must be a probability (>=0 and <=1)");
dropout_rate = d;
dropout_rate_h = d_h;
}

void CompactVanillaLSTMBuilder::disable_dropout() {
dropout_rate = 0.f;
dropout_rate_h = 0.f;
}
void CompactVanillaLSTMBuilder::set_weightnoise(float std) {
DYNET_ARG_CHECK(std >= 0.f, "weight noise must have standard deviation >=0");
weightnoise_std = std;
}



} // namespace dynet