[BUG] model.save method does not save trained W&D TF model with two optimizers #669

Closed
rnyak opened this issue Mar 19, 2021 · 4 comments · Fixed by #681
rnyak commented Mar 19, 2021

Describe the bug

The Outbrain W&D TF model cannot be saved after training completes; model.save results in the error below. We need to save the model properly to be able to serve it at the inference step.

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-19-5fe45a9d7548> in <module>
----> 1 model.save('./outbrain_tf/1/model.savedmodel')
/opt/conda/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py in save(self, filepath, overwrite, include_optimizer, save_format, signatures, options, save_traces)
   1999     """
   2000     # pylint: enable=line-too-long
-> 2001     save.save_model(self, filepath, overwrite, include_optimizer, save_format,
   2002                     signatures, options, save_traces)
   2003 
/opt/conda/lib/python3.8/site-packages/tensorflow/python/keras/saving/save.py in save_model(model, filepath, overwrite, include_optimizer, save_format, signatures, options, save_traces)
    154         model, filepath, overwrite, include_optimizer)
    155   else:
--> 156     saved_model_save.save(model, filepath, overwrite, include_optimizer,
    157                           signatures, options, save_traces)
    158 
/opt/conda/lib/python3.8/site-packages/tensorflow/python/keras/saving/saved_model/save.py in save(model, filepath, overwrite, include_optimizer, signatures, options, save_traces)
     87     with distribution_strategy_context._get_default_replica_context():  # pylint: disable=protected-access
     88       with utils.keras_option_scope(save_traces):
---> 89         save_lib.save(model, filepath, signatures, options)
     90 
     91   if not include_optimizer:
/opt/conda/lib/python3.8/site-packages/tensorflow/python/saved_model/save.py in save(obj, export_dir, signatures, options)
   1030   meta_graph_def = saved_model.meta_graphs.add()
   1031 
-> 1032   _, exported_graph, object_saver, asset_info = _build_meta_graph(
   1033       obj, signatures, options, meta_graph_def)
   1034   saved_model.saved_model_schema_version = constants.SAVED_MODEL_SCHEMA_VERSION
/opt/conda/lib/python3.8/site-packages/tensorflow/python/saved_model/save.py in _build_meta_graph(obj, signatures, options, meta_graph_def)
   1196 
   1197   with save_context.save_context(options):
-> 1198     return _build_meta_graph_impl(obj, signatures, options, meta_graph_def)
/opt/conda/lib/python3.8/site-packages/tensorflow/python/saved_model/save.py in _build_meta_graph_impl(obj, signatures, options, meta_graph_def)
   1160         function_aliases[fdef.name] = alias
   1161 
-> 1162   object_graph_proto = _serialize_object_graph(saveable_view,
   1163                                                asset_info.asset_index)
   1164   meta_graph_def.object_graph_def.CopyFrom(object_graph_proto)
/opt/conda/lib/python3.8/site-packages/tensorflow/python/saved_model/save.py in _serialize_object_graph(saveable_view, asset_file_def_index)
    752 
    753   for obj, obj_proto in zip(saveable_view.nodes, proto.nodes):
--> 754     _write_object_proto(obj, obj_proto, asset_file_def_index,
    755                         saveable_view.function_name_map)
    756   return proto
/opt/conda/lib/python3.8/site-packages/tensorflow/python/saved_model/save.py in _write_object_proto(obj, proto, asset_file_def_index, function_name_map)
    798           version=versions_pb2.VersionDef(
    799               producer=1, min_consumer=1, bad_consumers=[]),
--> 800           metadata=obj._tracking_metadata)
    801       # pylint:enable=protected-access
    802     proto.user_object.CopyFrom(registered_type_proto)
/opt/conda/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py in _tracking_metadata(self)
   3077   @property
   3078   def _tracking_metadata(self):
-> 3079     return self._trackable_saved_model_saver.tracking_metadata
   3080 
   3081   def _list_extra_dependencies_for_serialization(self, serialization_cache):
/opt/conda/lib/python3.8/site-packages/tensorflow/python/keras/saving/saved_model/base_serialization.py in tracking_metadata(self)
     53     # TODO(kathywu): check that serialized JSON can be loaded (e.g., if an
     54     # object is in the python property)
---> 55     return json_utils.Encoder().encode(self.python_properties)
     56 
     57   def list_extra_dependencies_for_serialization(self, serialization_cache):
/opt/conda/lib/python3.8/site-packages/tensorflow/python/keras/saving/saved_model/layer_serialization.py in python_properties(self)
     39   def python_properties(self):
     40     # TODO(kathywu): Add python property validator
---> 41     return self._python_properties_internal()
     42 
     43   def _python_properties_internal(self):
/opt/conda/lib/python3.8/site-packages/tensorflow/python/keras/saving/saved_model/model_serialization.py in _python_properties_internal(self)
     39 
     40     metadata.update(
---> 41         saving_utils.model_metadata(
     42             self.obj, include_optimizer=True, require_config=False))
     43     return metadata
/opt/conda/lib/python3.8/site-packages/tensorflow/python/keras/saving/saving_utils.py in model_metadata(model, include_optimizer, require_config)
    189                 generic_utils.get_registered_name(model.optimizer.__class__),
    190             'config':
--> 191                 model.optimizer.get_config()
    192         }
    193       metadata['training_config']['optimizer_config'] = optimizer_config
AttributeError: 'ListWrapper' object has no attribute 'get_config'

The reason is that we use two optimizers as a list, one for the deep model and one for the wide model, as shown below:

wide_and_deep_model.compile(
    optimizer=[wide_optimizer, deep_optimizer],
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.AUC()],
    experimental_run_tf_function=False
)
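
For illustration, the failure mode can be seen directly (this is just a restatement of the traceback above, not new API): Keras tracks the optimizer list as a ListWrapper, and the saving path calls get_config() on it, which a plain list does not implement.

# Assumes the model compiled above; based on the traceback, TF 2.4.x.
print(type(wide_and_deep_model.optimizer))  # a tracking ListWrapper, not an Optimizer
wide_and_deep_model.optimizer.get_config()  # AttributeError: 'ListWrapper' object has no attribute 'get_config'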

A related bug was already reported upstream: tensorflow/tensorflow#38143. A workaround was proposed there.

Potential solutions:

  1. Save the model with the include_optimizer=False param:

     wide_and_deep_model.save('/outbrain/outbrain_tf/1/model.savedmodel', include_optimizer=False)

     This seems to work at first, but then we get the same error above when we do:

     from nvtabular.inference.triton import export_tensorflow_ensemble
     export_tensorflow_ensemble(wide_and_deep_model, workflow, "outbrain", "/outbrain/models/", ["clicked"])

  2. TF doesn't seem to provide an elegant method for saving models with multiple optimizers. Maybe we could extend the ListWrapper and provide such a method by combining the config dicts of all the optimizers (see the sketch below).

Dawid from the JOC team has his own solution for saving a model with two optimizers: https://gitlab-master.nvidia.com/dl/JoC/wide_deep_tf2/-/blob/inference/trainer/model/layers.py#L163-174
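
For what it's worth, a minimal, untested sketch of the idea in solution 2: a small wrapper around the optimizer list that exposes the combined get_config() that fails in saving_utils.model_metadata. Everything below is hypothetical; a real fix would also need to dispatch apply_gradients and friends to the right sub-optimizer, as Dawid's solution does.

class OptimizerList:
    """Hypothetical sketch: hold several optimizers and expose the combined
    config that Keras model serialization asks for via get_config()."""

    def __init__(self, optimizers):
        self.optimizers = list(optimizers)

    def get_config(self):
        # Combine each optimizer's config dict into one serializable dict.
        return {
            "optimizers": [
                {"class_name": opt.__class__.__name__, "config": opt.get_config()}
                for opt in self.optimizers
            ]
        }

# e.g. OptimizerList([wide_optimizer, deep_optimizer]).get_config()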

Steps/Code to reproduce bug
You can visit this notebook: https://github.com/rnyak/NVTabular/blob/outbrain-inf/examples/inference_triton/inference-Outbrain/03-Training-with-TF.ipynb

Environment details (please complete the following information):

  • Environment location: Docker
  • Method of NVTabular install: merlin-tf-training container

Additional comments

Another issue is that wide_and_deep_model.inputs returns empty, so we get an error from _generate_tensorflow_config because model.inputs is None.

    """given a workflow generates the trton modelconfig proto object describing the inputs
    and outputs to that workflow"""
    config = model_config.ModelConfig(
        name=name, backend="tensorflow", platform="tensorflow_savedmodel"
    )

    for col in model.inputs:
        config.input.append(
            model_config.ModelInput(
                name=col.name, data_type=_convert_dtype(col.dtype), dims=[-1, 1]
            )
        )
       ....

A solution to that could be to do something like what TF does here: https://github.com/tensorflow/tensorflow/blob/v2.4.1/tensorflow/python/keras/premade/wide_deep.py#L94-L97
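
A rough, untested sketch of that idea (assuming the wide and deep branches are built from the same Keras Input tensors, as in the notebook): wrap the combined model in a functional tf.keras.Model so that .inputs and .outputs get populated for _generate_tensorflow_config.

import tensorflow as tf

# Rough, untested sketch: if the subclassed model never records its inputs,
# calling it on the shared Input tensors and wrapping the result in a
# functional Model makes .inputs/.outputs non-empty.
inputs = dnn_model.inputs                  # shared tf.keras.Input tensors
outputs = wide_and_deep_model(inputs)      # run the combined model on them
wrapped = tf.keras.Model(inputs=inputs, outputs=outputs)
assert wrapped.inputs  # now populated for config generation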

rnyak added the bug label on Mar 19, 2021
rnyak commented Mar 19, 2021

@benfred @oyilmaz-nvidia for visibility.

Linking another related GH issue here: #668

rnyak commented Mar 24, 2021

@benfred @oyilmaz-nvidia

Two issues to handle here:

  1. How to save a model with a list of optimizers?

Tested solution:

wide_and_deep_model.save('/outbrain/tf_models/outbrain_tf/1/model.savedmodel', include_optimizer=False)

That saves the model.

  2. wide_and_deep_model.inputs returns None.

Tried solution:

model = tf.keras.models.load_model("/outbrain/tf_models/outbrain_tf/1/model.savedmodel")
model.inputs = dnn_model.inputs
model.outputs = dnn_model.outputs

That seems to work, and we can create the config.pbtxt for outbrain_nvt; however, when we try to load the model on the server we get this error:

E0324 20:01:20.545014 32015 model_repository_manager.cc:963] failed to load 'outbrain_tf' version 1: 
Invalid argument: unexpected inference input 'TE_ad_id_clicked', allowed inputs are: inputs/TE_ad_id_clicked, inputs/TE_advertiser_id_clicked, inputs/TE_campaign_id_clicked, inputs/TE_document_id_promo_clicked, inputs/TE_publisher_id_clicked, inputs/TE_source_id_clicked, inputs/ad_id, inputs/advertiser_id, inputs/campaign_id, inputs/document_id, inputs/document_id_document_id_promo_sim_categories, inputs/document_id_document_id_promo_sim_entities, inputs/document_id_document_id_promo_sim_topics, inputs/document_id_promo, inputs/geo_location, inputs/geo_location_country, inputs/geo_location_state, inputs/platform, inputs/publish_time_promo_since_published, inputs/publish_time_since_published, inputs/publisher_id, inputs/publisher_id_promo, inputs/source_id, inputs/source_id_promo

Somehow, it wants us to prepend inputs/ to the name of each input variable, and I assume this is because we set model.inputs = dnn_model.inputs.

So the issue remains.

benfred commented Mar 30, 2021

I'm wondering if this is a difference between dnn_model.inputs and model.signatures["serving_default"].inputs (which, afaict, is what Triton uses). Will investigate to see if there is an easy fix here.
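
Something like this should show the difference (path reused from the comment above; just a quick sanity check, not a fix):

import tensorflow as tf

# Compare the serving signature's expected input names (what Triton reads,
# afaict) with the Keras-level input names.
loaded = tf.saved_model.load("/outbrain/tf_models/outbrain_tf/1/model.savedmodel")
sig = loaded.signatures["serving_default"]
print(sig.structured_input_signature)      # expect names like inputs/ad_id
print([t.name for t in dnn_model.inputs])  # Keras-level names like ad_id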

viswa-nvidia added this to the NVTabular v0.6 milestone on May 4, 2021
benfred self-assigned this on Jun 3, 2021
benfred linked a pull request (#681) on Jul 22, 2021 that will close this issue
benfred commented Jul 22, 2021

This is fixed by #681
