Merge pull request #117 from inspirehep/keras-2-compatibility

Release Magpie 2.0
inspirehep · Oct 14, 2017 · f48e42c · f48e42c
2 parents 1ca6ea1 + 0b7f6f9
commit f48e42c
Show file tree

Hide file tree

Showing 7 changed files with 85 additions and 57 deletions.
diff --git a/README.md b/README.md
@@ -4,10 +4,9 @@ Magpie is a deep learning tool for multi-label text classification. It learns on
 
 ## Very short introduction
 ```
->>> from magpie import MagpieModel
->>> magpie = MagpieModel()
+>>> magpie = Magpie()
 >>> magpie.init_word_vectors('/path/to/corpus', vec_dim=100)
->>> magpie.train('/path/to/corpus', ['label1', 'label2', 'label3'], nb_epochs=3)
+>>> magpie.train('/path/to/corpus', ['label1', 'label2', 'label3'], epochs=3)
 Training...
 >>> magpie.predict_from_text('Well, that was quick!')
 [('label1', 0.96), ('label3', 0.65), ('label2', 0.21)]
@@ -24,9 +23,9 @@ $ ls data/hep-categories
 
 Before you train the model, you need to build appropriate word vector representations for your corpus. In theory, you can train them on a different corpus or reuse already trained ones ([tutorial](http://rare-technologies.com/word2vec-tutorial/)), however Magpie enables you to do that as well.
 ```python
-from magpie import MagpieModel
+from magpie import Magpie
 
-magpie = MagpieModel()
+magpie = Magpie()
 magpie.train_word2vec('data/hep-categories', vec_dim=100)
 ```
 
@@ -41,10 +40,10 @@ You would usually want to combine those two steps, by simply running:
 magpie.init_word_vectors('data/hep-categories', vec_dim=100)
 ```
 
-If you plan to reuse the trained word representations, you might want to save them and pass in the constructor to `MagpieModel` next time. For the training, just type:
+If you plan to reuse the trained word representations, you might want to save them and pass them to the `Magpie` constructor next time. For the training, just type:
 ```python
 labels = ['Gravitation and Cosmology', 'Experiment-HEP', 'Theory-HEP']
-magpie.train('data/hep-categories', labels, test_ratio=0.2, nb_epochs=30)
+magpie.train('data/hep-categories', labels, test_ratio=0.2, epochs=30)
 ```
 By providing the `test_ratio` argument, you make the model split the data into train and test datasets (in this example in an 80/20 ratio) and evaluate itself after every epoch, displaying its current loss and accuracy. The default value of `test_ratio` is 0, meaning that all the data will be used for training.
 
@@ -63,7 +62,7 @@ Trained models can be used for prediction with methods:
  ('Theory-HEP', 0.20917746)]
 ```
 ## Saving & loading the model
-A `MagpieModel` object consists of three components - the word2vec mappings, a scaler and a `keras` model. In order to train Magpie you can either provide the word2vec mappings and a scaler in advance or let the program compute them for you on the training data. Usually you would want to train them yourself on a full dataset and reuse them afterwards. You can use the provided functions for that purpose:
+A `Magpie` object consists of three components - the word2vec mappings, a scaler and a `keras` model. In order to train Magpie you can either provide the word2vec mappings and a scaler in advance or let the program compute them for you on the training data. Usually you would want to train them yourself on a full dataset and reuse them afterwards. You can use the provided functions for that purpose:
 
 ```python
 magpie.save_word2vec_model('/save/my/embeddings/here')
@@ -74,7 +73,7 @@ magpie.save_model('/save/my/model/here.h5')
 When you want to reinitialize your trained model, you can run:
 
 ```python
-magpie = MagpieModel(
+magpie = Magpie(
     keras_model='/save/my/model/here.h5',
     word2vec_model='/save/my/embeddings/here',
     scaler='/save/my/scaler/here',
@@ -87,9 +86,12 @@ or just pass the objects directly!
 
 The package is not on PyPI, but you can get it directly from GitHub:
 ```
-$ pip install git+https://github.com/inspirehep/magpie.git@v1.0
+$ pip install git+https://github.com/inspirehep/magpie.git@v2.0
 ```
 If you encounter any problems with the installation, make sure to install the correct versions of the dependencies listed in the `setup.py` file.
 
+## Magpie v1.0 vs v2.0
+Magpie v1.0 depends on Keras v1.X, while Magpie v2.0 depends on Keras v2.X. You can install and use either of them, but bear in mind that only v2.0 will be developed in the future. If you have trouble with the installation, make sure that Magpie and Keras have the same major version.
+
 ## Contact
 If you have any problems, feel free to open an issue. We'll do our best to help :+1:
diff --git a/magpie/__init__.py b/magpie/__init__.py
@@ -1 +1 @@
-from .main import MagpieModel
+from .main import Magpie
diff --git a/magpie/config.py b/magpie/config.py
@@ -11,7 +11,7 @@
 
 # Training parameters
 BATCH_SIZE = 64
-NB_EPOCHS = 1
+EPOCHS = 1
 
 # Number of tokens to save from the abstract, zero padded
 SAMPLE_LENGTH = 200
diff --git a/magpie/main.py b/magpie/main.py
@@ -1,5 +1,6 @@
 from __future__ import unicode_literals, print_function, division
 
+import math
 import os
 import sys
 from six import string_types
@@ -9,13 +10,13 @@
 
 from magpie.base.document import Document
 from magpie.base.word2vec import train_word2vec, fit_scaler
-from magpie.config import NN_ARCHITECTURE, BATCH_SIZE, EMBEDDING_SIZE, NB_EPOCHS
+from magpie.config import NN_ARCHITECTURE, BATCH_SIZE, EMBEDDING_SIZE, EPOCHS
 from magpie.nn.input_data import get_data_for_model
 from magpie.nn.models import get_nn_model
 from magpie.utils import save_to_disk, load_from_disk
 
 
-class MagpieModel(object):
+class Magpie(object):
 
     def __init__(self, keras_model=None, word2vec_model=None, scaler=None,
                  labels=None):
@@ -38,7 +39,7 @@ def __init__(self, keras_model=None, word2vec_model=None, scaler=None,
 
     def train(self, train_dir, vocabulary, test_dir=None, callbacks=None,
               nn_model=NN_ARCHITECTURE, batch_size=BATCH_SIZE, test_ratio=0.0,
-              nb_epochs=NB_EPOCHS, verbose=1):
+              epochs=EPOCHS, verbose=1):
         """
         Train the model on given data
         :param train_dir: directory with data files. Text files should end with
@@ -51,7 +52,7 @@ def train(self, train_dir, vocabulary, test_dir=None, callbacks=None,
         :param batch_size: size of one batch
         :param test_ratio: the ratio of samples that will be withheld from training
         and used for testing. This can be overridden by test_dir.
-        :param nb_epochs: number of epochs to train
+        :param epochs: number of epochs to train
         :param verbose: 0, 1 or 2. As in Keras.
 
         :return: History object
@@ -99,7 +100,7 @@ def train(self, train_dir, vocabulary, test_dir=None, callbacks=None,
             x_train,
             y_train,
             batch_size=batch_size,
-            nb_epoch=nb_epochs,
+            epochs=epochs,
             validation_data=test_data,
             validation_split=test_ratio,
             callbacks=callbacks or [],
@@ -108,7 +109,7 @@ def train(self, train_dir, vocabulary, test_dir=None, callbacks=None,
 
     def batch_train(self, train_dir, vocabulary, test_dir=None, callbacks=None,
                     nn_model=NN_ARCHITECTURE, batch_size=BATCH_SIZE,
-                    nb_epochs=NB_EPOCHS, verbose=1):
+                    epochs=EPOCHS, verbose=1):
         """
         Train the model on given data
         :param train_dir: directory with data files. Text files should end with
@@ -119,7 +120,7 @@ def batch_train(self, train_dir, vocabulary, test_dir=None, callbacks=None,
         :param callbacks: objects passed to the Keras fit function as callbacks
         :param nn_model: string defining the NN architecture e.g. 'crnn'
         :param batch_size: size of one batch
-        :param nb_epochs: number of epochs to train
+        :param epochs: number of epochs to train
         :param verbose: 0, 1 or 2. As in Keras.
 
         :return: History object
@@ -163,10 +164,13 @@ def batch_train(self, train_dir, vocabulary, test_dir=None, callbacks=None,
             scaler=self.scaler,
         )
 
+        nb_of_files = len({filename[:-4] for filename in os.listdir(train_dir)})
+        steps_per_epoch = math.ceil(nb_of_files / batch_size)
+
         return self.keras_model.fit_generator(
             train_generator,
-            len({filename[:-4] for filename in os.listdir(train_dir)}),
-            nb_epochs,
+            steps_per_epoch=steps_per_epoch,
+            epochs=epochs,
             validation_data=test_data,
             callbacks=callbacks or [],
             verbose=verbose,

diff --git a/magpie/nn/models.py b/magpie/nn/models.py
@@ -1,8 +1,6 @@
-from keras.layers.convolutional import MaxPooling1D, Convolution1D
-from keras.layers.core import Flatten, Dropout, Dense, Merge
-from keras.layers.normalization import BatchNormalization
-from keras.layers.recurrent import GRU
-from keras.models import Sequential
+from keras.layers import Input, Dense, GRU, Dropout, BatchNormalization, \
+                         MaxPooling1D, Conv1D, Flatten, Concatenate
+from keras.models import Model
 
 from magpie.config import SAMPLE_LENGTH
 
@@ -18,31 +16,33 @@ def get_nn_model(nn_model, embedding, output_length):
 
 def cnn(embedding_size, output_length):
     """ Create and return a keras model of a CNN """
+
     NB_FILTER = 256
     NGRAM_LENGTHS = [1, 2, 3, 4, 5]
 
-    conv_layers = []
+    conv_layers, inputs = [], []
+
     for ngram_length in NGRAM_LENGTHS:
-        ngram_layer = Sequential()
-        ngram_layer.add(Convolution1D(
+        current_input = Input(shape=(SAMPLE_LENGTH, embedding_size))
+        inputs.append(current_input)
+
+        convolution = Conv1D(
             NB_FILTER,
             ngram_length,
-            input_dim=embedding_size,
-            input_length=SAMPLE_LENGTH,
-            init='lecun_uniform',
+            kernel_initializer='lecun_uniform',
             activation='tanh',
-        ))
-        pool_length = SAMPLE_LENGTH - ngram_length + 1
-        ngram_layer.add(MaxPooling1D(pool_length=pool_length))
-        conv_layers.append(ngram_layer)
+        )(current_input)
 
-    model = Sequential()
-    model.add(Merge(conv_layers, mode='concat'))
+        pool_size = SAMPLE_LENGTH - ngram_length + 1
+        pooling = MaxPooling1D(pool_size=pool_size)(convolution)
+        conv_layers.append(pooling)
 
-    model.add(Dropout(0.5))
-    model.add(Flatten())
+    merged = Concatenate()(conv_layers)
+    dropout = Dropout(0.5)(merged)
+    flattened = Flatten()(dropout)
+    outputs = Dense(output_length, activation='sigmoid')(flattened)
 
-    model.add(Dense(output_length, activation='sigmoid'))
+    model = Model(inputs=inputs, outputs=outputs)
 
     model.compile(
         loss='binary_crossentropy',
@@ -57,20 +57,21 @@ def rnn(embedding_size, output_length):
     """ Create and return a keras model of a RNN """
     HIDDEN_LAYER_SIZE = 256
 
-    model = Sequential()
+    inputs = Input(shape=(SAMPLE_LENGTH, embedding_size))
 
-    model.add(GRU(
+    gru = GRU(
         HIDDEN_LAYER_SIZE,
-        input_dim=embedding_size,
-        input_length=SAMPLE_LENGTH,
-        init='glorot_uniform',
-        inner_init='normal',
+        input_shape=(SAMPLE_LENGTH, embedding_size),
+        kernel_initializer="glorot_uniform",
+        recurrent_initializer='normal',
         activation='relu',
-    ))
-    model.add(BatchNormalization())
-    model.add(Dropout(0.1))
+    )(inputs)
+
+    batch_normalization = BatchNormalization()(gru)
+    dropout = Dropout(0.1)(batch_normalization)
+    outputs = Dense(output_length, activation='sigmoid')(dropout)
 
-    model.add(Dense(output_length, activation='sigmoid'))
+    model = Model(inputs=inputs, outputs=outputs)
 
     model.compile(
         loss='binary_crossentropy',

diff --git a/magpie/tests/test_api.py b/magpie/tests/test_api.py
@@ -2,22 +2,43 @@
 import os
 import unittest
 
+from magpie import Magpie
+
 # This one is hacky, but I'm too lazy to do it properly!
 PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
 DATA_DIR = os.path.join(PROJECT_DIR, 'data', 'hep-categories')
 
 class TestAPI(unittest.TestCase):
 	""" Basic integration test """
-	def test_integrity(self):
+	def test_cnn_train(self):
+		# Get them labels!
+		with io.open(DATA_DIR + '.labels', 'r') as f:
+			labels = {line.rstrip('\n') for line in f}
+
+		# Run the model
+		model = Magpie()
+		model.init_word_vectors(DATA_DIR, vec_dim=100)
+		history = model.train(DATA_DIR, labels, nn_model='cnn', test_ratio=0.3, epochs=3)
+		assert history is not None
+
+		# Do a simple prediction
+		predictions = model.predict_from_text("Black holes are cool!")
+		assert len(predictions) == len(labels)
+
+		# Assert the hell out of it!
+		for lab, val in predictions:
+			assert lab in labels
+			assert 0 <= val <= 1
+
+	def test_rnn_batch_train(self):
 		# Get them labels!
 		with io.open(DATA_DIR + '.labels', 'r') as f:
 			labels = {line.rstrip('\n') for line in f}
 
 		# Run the model
-		from magpie import MagpieModel
-		model = MagpieModel()
+		model = Magpie()
 		model.init_word_vectors(DATA_DIR, vec_dim=100)
-		history = model.train(DATA_DIR, labels, test_ratio=0.3, nb_epochs=3)
+		history = model.batch_train(DATA_DIR, labels, nn_model='rnn', epochs=3)
 		assert history is not None
 
 		# Do a simple prediction

diff --git a/setup.py b/setup.py
@@ -22,7 +22,7 @@
     # Versions should comply with PEP440.  For a discussion on single-sourcing
     # the version across setup.py and the project code, see
     # https://packaging.python.org/en/latest/single_source_version.html
-    version='1.0',
+    version='2.0',
 
     description='Automatic text classification tool',
     # long_description=long_description,
@@ -73,7 +73,7 @@
         'scipy~=0.18',
         'gensim~=0.13',
         'scikit-learn~=0.18',
-        'keras~=1.2.2',
+        'keras~=2.0',
         'h5py~=2.6',
     ],