Copyright © 2017-2021 ABBYY Production LLC

[1]:
#@title
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Porting neural networks

Introduction

In this tutorial, we’ll demonstrate how to transfer a pretrained neural network from Keras to NeoML.Dnn for further tuning and inference.

The experimental setup consists of a model with an embedding layer followed by a fully connected layer and a bidirectional LSTM. The aim of this toy example is to show the basic principles of migration; therefore, we do not perform any training and use small dimension sizes to keep the tensors human-readable. The model considered here is unlikely to solve any real task.

The tutorial includes the following steps:

1. Setup
2. Prepare models
3. Weights transform and transfer
4. Checking the results

Setup

To prevent possible GPU memory shortage, force Keras to use CPU. This step is optional.

[2]:
import os
# Hide every CUDA device from TensorFlow so that Keras falls back to the CPU.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "-1",
})
[3]:
import typing as tp
import numpy as np
# Seed NumPy's global generator so that the random input data is reproducible.
# Merely constructing a RandomState object and discarding it seeds nothing.
np.random.seed(42)

import tensorflow as tf
from tensorflow import keras

import neoml

Prepare models

First, train your Keras model to get the weights to transfer (here we transfer a randomly initialized model). Then you need a NeoML model with an equivalent architecture. Running both frameworks in the same environment is convenient but not mandatory: some versions of TensorFlow and NeoML are incompatible due to different CUDA or numpy requirements. In that case, create two environments: train the Keras model and save its weights to files (e.g. with numpy.save) in the first one, then load the weights and run NeoML in the other.

Correspondence table for the layers used in this tutorial:

| Keras | NeoML |
| --- | --- |
| Input | Source |
| Embedding | MultichannelLookup |
| Dense | FullyConnected |
| Dense with activation | FullyConnected + layer from Dnn.Activation |
| LSTM | LSTM |
| Bidirectional( layer ) | 2 x layer + Concat |
| output | Sink |

[4]:
# Layer names shared by the Keras and the NeoML models.
SOURCE_LAYER_NAME = "Input"
EMBEDDING_LAYER_NAME = "Embedding"
DENSE_LAYER_NAME = "Dense"
LSTM_LAYER_NAME = "LSTM"
FWD_LSTM_LAYER_NAME = "LSTM-forward"
BWD_LSTM_LAYER_NAME = "LSTM-backward"

# Layer sizes; deliberately tiny so that the tensors stay human-readable.
VOCAB_SIZE = 5
EMBEDDING_DIM = 5
DENSE_DIM = 2
LSTM_DIM = 3
NUM_CLASSES = 2

# Size of the sample input fed to both models.
BATCH_SIZE = 3
SEQ_LEN = 2
[5]:
def build_keras_model() -> keras.Model:
    """Build the Keras reference model: Embedding -> Dense(tanh) -> bidirectional LSTM.

    The model takes variable-length sequences of int32 token ids and exposes
    two outputs: the activated dense tensor and the outputs of the
    bidirectional LSTM (the LSTM is created with ``return_sequences=True``
    and ``return_state=True``).
    """
    token_ids = keras.layers.Input(shape=(None,), dtype=np.int32)

    embedding_layer = keras.layers.Embedding(
        name=EMBEDDING_LAYER_NAME, output_dim=EMBEDDING_DIM, input_dim=VOCAB_SIZE)
    embedded = embedding_layer(token_ids)

    dense_layer = keras.layers.Dense(
        name=DENSE_LAYER_NAME, units=DENSE_DIM, activation='tanh')
    dense_out = dense_layer(embedded)

    # The same LSTM configuration is applied in both directions by the wrapper.
    recurrent_layer = keras.layers.LSTM(
        units=LSTM_DIM, return_sequences=True, return_state=True)
    bidirectional_layer = keras.layers.Bidirectional(
        name=LSTM_LAYER_NAME, layer=recurrent_layer)
    lstm_outs = bidirectional_layer(dense_out)

    return keras.Model(token_ids, [dense_out, lstm_outs])

[6]:
class NeomlModel():
    """NeoML counterpart of the Keras model: lookup -> dense + tanh -> two LSTMs + concat.

    Besides the concatenated LSTM output, sinks are attached to the embedding,
    the activated dense layer and output 1 of each LSTM, so that intermediate
    results can be read back after a run.
    """

    def __init__(self, useCuda=False):
        """Create the math engine and assemble the network.

        :param useCuda: when true, use ``GpuMathEngine(0)``; otherwise
            ``CpuMathEngine(0)``.
        """
        self._engine = neoml.MathEngine.GpuMathEngine(0) if useCuda else neoml.MathEngine.CpuMathEngine(0)
        self._dnn = neoml.Dnn.Dnn(self._engine)
        source = neoml.Dnn.Source(self._dnn, SOURCE_LAYER_NAME)

        embedding = neoml.Dnn.MultichannelLookup(
            source, dimensions=[(VOCAB_SIZE, EMBEDDING_DIM)], name=EMBEDDING_LAYER_NAME)
        neoml.Dnn.Sink(embedding, name=EMBEDDING_LAYER_NAME + "-output")

        # Use the shared constant: the layer is later looked up by DENSE_LAYER_NAME.
        dense = neoml.Dnn.FullyConnected(embedding, DENSE_DIM, name=DENSE_LAYER_NAME)
        dense_activated = neoml.Dnn.Activation.Tanh(dense)
        neoml.Dnn.Sink(dense_activated, name=DENSE_LAYER_NAME + "-output")

        # A bidirectional LSTM is emulated by a forward and a reversed LSTM
        # whose outputs are concatenated along channels.
        fwd_rnn = neoml.Dnn.Lstm(dense_activated, LSTM_DIM, name=FWD_LSTM_LAYER_NAME)
        bwd_rnn = neoml.Dnn.Lstm(dense_activated, LSTM_DIM, name=BWD_LSTM_LAYER_NAME, reverse_seq=True)
        join_rnn = neoml.Dnn.Concat.ConcatChannels(input_layers=[fwd_rnn, bwd_rnn])
        # You don't need to insert so many sink layers normally. We create them for debug purposes only.
        neoml.Dnn.Sink(join_rnn, name=LSTM_LAYER_NAME + "-output")
        # The "-cell" sinks are re-connected to output 1 of the corresponding LSTM.
        s = neoml.Dnn.Sink(fwd_rnn, name=FWD_LSTM_LAYER_NAME + "-cell")
        s.connect(fwd_rnn, 1)
        s = neoml.Dnn.Sink(bwd_rnn, name=BWD_LSTM_LAYER_NAME + "-cell")
        s.connect(bwd_rnn, 1)

    @property
    def engine(self) -> neoml.MathEngine.MathEngine:
        """The math engine the network was created on."""
        return self._engine

    @property
    def dnn(self) -> neoml.Dnn.Dnn:
        """The underlying NeoML network."""
        return self._dnn

    def __getitem__(self, key: str) -> neoml.Dnn.Layer:
        """Return the layer of the network registered under the name ``key``."""
        return self._dnn.layers[key]

    def asblob(self, array: np.ndarray, shape: tp.Sequence[int]) -> neoml.Blob.Blob:
        """Wrap ``array`` into a NeoML blob of the given ``shape`` on this model's engine.

        :param array: data to wrap; its shape must equal ``shape`` with all
            dimensions of size 1 dropped.
        :param shape: the dimensions of the target blob.
        :raises ValueError: if the number of elements or the order of the
            dimensions of ``array`` does not match ``shape``.
        """
        # Explicit exceptions instead of asserts: the checks must survive `python -O`.
        if array.size != np.prod(shape):
            raise ValueError("check array size")
        if array.shape != tuple(dim for dim in shape if dim != 1):
            raise ValueError("check the order of dimensions, transpose if needed")
        return neoml.Blob.asblob(self._engine, array, shape)

[7]:
# Instantiate both models. The Keras model is left untrained: its initial
# weights are the ones transferred to the NeoML model (built with the default
# useCuda=False, i.e. on the CPU math engine).
keras_model = build_keras_model()
neo_model = NeomlModel()
[8]:
# create same pieces of data to feed the models
# Keras input: a (BATCH_SIZE, SEQ_LEN) array of token ids drawn from [1, VOCAB_SIZE).
np_blob = np.random.randint(1, VOCAB_SIZE, (BATCH_SIZE, SEQ_LEN))
# NeoML input: the same data transposed, so that the sequence axis comes first.
# NOTE(review): the integer dtype of np_blob is platform-dependent - confirm neoml.Blob.asblob accepts it.
neo_blob = neo_model.asblob(np_blob.T, (SEQ_LEN, BATCH_SIZE, 1, 1, 1, 1, 1))

# Run the NeoML network once just to initialize it; the result is discarded.
# Presumably this allocates the layers' weight blobs whose shapes are read later - TODO confirm.
neo_model.dnn.run({SOURCE_LAYER_NAME: neo_blob})
# Display the generated input (notebook cell output).
np_blob
[8]:
array([[1, 1],
       [1, 3],
       [4, 4]])

Weights transform and transfer

Finally, we can load the prepared weights into the NeoML model. As you will see below, most weights can be transferred ‘as-is’. The only exception is LSTM. Note the main differences between the Keras and NeoML implementations:

- Keras LSTM is batch-first by default, while NeoML LSTM is time-first. In terms of the seven-dimensional NeoML blob, batch length is the sequence length, batch width is the batch size, and channels is the hidden dimension size.
- NeoML doesn’t support Keras masks: LSTM processes the whole sequence, including padding.
- The order of gates in NeoML's concatenated kernels is non-standard: (cell | forget | input | output) instead of (input | forget | cell | output). In addition, the cell gate is called the main gate, and the output gate is called the reset gate.

Embedding layer

[9]:
# The Keras embedding matrix is wrapped without any transformation; the target
# shape is taken from the embeddings blob the NeoML layer already holds.
embedding_blob = neo_model.asblob(keras_model.get_layer(EMBEDDING_LAYER_NAME).weights[0].numpy(),
                                  neo_model[EMBEDDING_LAYER_NAME].get_embeddings(0).shape)
# Replace embedding table 0 of the lookup layer with the Keras weights.
neo_model[EMBEDDING_LAYER_NAME].set_embeddings(index=0, blob=embedding_blob)

Dense layer

[10]:
keras_dense = keras_model.get_layer(DENSE_LAYER_NAME)
# weights[0] is used as the kernel and is transposed before being wrapped;
# weights[1] is used as the free term (bias) and is wrapped unchanged.
dense_kernel = neo_model.asblob(keras_dense.weights[0].numpy().T, neo_model[DENSE_LAYER_NAME].weights.shape)
dense_free_term = neo_model.asblob(keras_dense.weights[1].numpy(), neo_model[DENSE_LAYER_NAME].free_term.shape)
# Overwrite the parameters of the NeoML fully connected layer.
neo_model[DENSE_LAYER_NAME].weights = dense_kernel
neo_model[DENSE_LAYER_NAME].free_term = dense_free_term

LSTM layer

[11]:
def reorder_lstm(weights: tp.Union[np.ndarray, "tf.Variable"], lstm_dim: int) -> np.ndarray:
    """Rearrange concatenated LSTM gate weights from the Keras to the NeoML layout.

    The input is transposed, split into four gate blocks of ``lstm_dim`` rows
    each, and blocks 0 and 2 are swapped.

    :param weights: concatenated weights of the four gates, either a NumPy
        array or an object exposing ``.numpy()`` (e.g. a Keras weight variable).
    :param lstm_dim: hidden size of the LSTM.
    :return: a vector of length ``4 * lstm_dim`` when the input holds exactly
        ``4 * lstm_dim`` values (free term), otherwise a matrix with
        ``4 * lstm_dim`` rows.
    """
    # Swapping blocks 0 and 2 exchanges the first and the third gate.
    permutation = [2, 1, 0, 3]
    # Accept real arrays as well; convert framework variables only once.
    values = weights.numpy() if hasattr(weights, "numpy") else np.asarray(weights)
    transposed = values.T
    # free_term
    if values.size == 4 * lstm_dim:
        return transposed.reshape(4, lstm_dim)[permutation].reshape(4 * lstm_dim)
    # matrix
    return transposed.reshape(4, lstm_dim, -1)[permutation].reshape(4 * lstm_dim, -1)
[12]:
keras_lstm = keras_model.get_layer(LSTM_LAYER_NAME)

# The bidirectional wrapper's weights are used as follows: indices 0-2 are the
# forward (input kernel, recurrent kernel, free term), indices 3-5 the same for
# the backward direction. Each one is reordered to the NeoML gate layout and
# wrapped into a blob shaped like the corresponding NeoML parameter.
forward_input_kernel = neo_model.asblob(reorder_lstm(keras_lstm.weights[0], LSTM_DIM), neo_model[FWD_LSTM_LAYER_NAME].input_weights.shape)
forward_rec_kernel = neo_model.asblob(reorder_lstm(keras_lstm.weights[1], LSTM_DIM), neo_model[FWD_LSTM_LAYER_NAME].recurrent_weights.shape)
forward_free_term = neo_model.asblob(reorder_lstm(keras_lstm.weights[2], LSTM_DIM), neo_model[FWD_LSTM_LAYER_NAME].input_free_term.shape)
backward_input_kernel = neo_model.asblob(reorder_lstm(keras_lstm.weights[3], LSTM_DIM), neo_model[BWD_LSTM_LAYER_NAME].input_weights.shape)
backward_rec_kernel = neo_model.asblob(reorder_lstm(keras_lstm.weights[4], LSTM_DIM), neo_model[BWD_LSTM_LAYER_NAME].recurrent_weights.shape)
backward_free_term = neo_model.asblob(reorder_lstm(keras_lstm.weights[5], LSTM_DIM), neo_model[BWD_LSTM_LAYER_NAME].input_free_term.shape)
# NeoML's LSTM has two free terms to be compatible with ONNX and PyTorch, we leave one of them filled with zeros
zero_free_term = neo_model.asblob(np.zeros(4 * LSTM_DIM, dtype=np.float32), neo_model[FWD_LSTM_LAYER_NAME].recurrent_free_term.shape)

# Assign the prepared blobs to the forward LSTM ...
neo_model[FWD_LSTM_LAYER_NAME].input_weights = forward_input_kernel
neo_model[FWD_LSTM_LAYER_NAME].recurrent_weights = forward_rec_kernel
neo_model[FWD_LSTM_LAYER_NAME].input_free_term = forward_free_term
neo_model[FWD_LSTM_LAYER_NAME].recurrent_free_term = zero_free_term
# ... and to the backward LSTM. The same zero blob object is assigned to both layers.
# NOTE(review): presumably the setter copies the data, so sharing it is harmless - confirm.
neo_model[BWD_LSTM_LAYER_NAME].input_weights = backward_input_kernel
neo_model[BWD_LSTM_LAYER_NAME].recurrent_weights = backward_rec_kernel
neo_model[BWD_LSTM_LAYER_NAME].input_free_term = backward_free_term
neo_model[BWD_LSTM_LAYER_NAME].recurrent_free_term = zero_free_term

Check the results

[13]:
# The Keras model returns its two outputs: the dense tensor and the list
# produced by the bidirectional LSTM.
keras_dense_out, keras_lstm_outs = keras_model(np_blob)
# Element 0 of the LSTM list is used as the per-timestep sequence output.
keras_lstm_out = keras_lstm_outs[0].numpy()
# No need to check h_n since the LSTM has single layer and all hidden states are included in lstm_out
# Elements 2 and 4 are used as the final cell states of the forward and the backward direction.
keras_fwd_cell = keras_lstm_outs[2].numpy()
keras_bwd_cell = keras_lstm_outs[4].numpy()
[14]:
# Feed the source layer through the shared name constant instead of a duplicated literal.
output = neo_model.dnn.run({SOURCE_LAYER_NAME: neo_blob})
# The NeoML input was fed time-first; swap the first two axes of the outputs
# so that they can be compared with the batch-first Keras results.
neo_dense_out = output[DENSE_LAYER_NAME + "-output"].asarray().transpose((1,0,2))
neo_lstm_out = output[LSTM_LAYER_NAME + "-output"].asarray().transpose((1,0,2))
# NeoML returns all cell states for each timestep
# The last timestep is taken for the forward LSTM and the first one for the
# reversed LSTM (presumably where each direction finishes processing).
neo_fwd_cell = output[FWD_LSTM_LAYER_NAME + "-cell"].asarray()[-1]
neo_bwd_cell = output[BWD_LSTM_LAYER_NAME + "-cell"].asarray()[0]
[15]:
# Largest absolute difference between the NeoML and the Keras results for:
# dense output, LSTM sequence output, forward cell state, backward cell state.
(
    abs(neo_dense_out - keras_dense_out.numpy()).max(),
    abs(neo_lstm_out - keras_lstm_out).max(),
    abs(neo_fwd_cell - keras_fwd_cell).max(),
    abs(neo_bwd_cell - keras_bwd_cell).max()
)
[15]:
(9.313226e-08, 6.030314e-08, 5.401671e-08, 9.49949e-08)