Enable SD XL ONNX export and ONNX Runtime inference #1168

Merged
40 commits merged on Jul 17, 2023

Changes from 29 commits

Commits
691cee3
add stable diffusion XL export
echarlaix Jul 6, 2023
b2b8d73
fix style
echarlaix Jul 6, 2023
86bd2aa
fix test model name
echarlaix Jul 6, 2023
9930adb
merge main in branch
echarlaix Jul 6, 2023
d98e859
Fix issues related to merging with main
echarlaix Jul 6, 2023
8f8f595
fix
echarlaix Jul 6, 2023
ef7d65b
fix style
echarlaix Jul 6, 2023
4d4eded
remove clip with projection from test
echarlaix Jul 7, 2023
d813195
change model name
echarlaix Jul 7, 2023
e3b81d0
fix style
echarlaix Jul 7, 2023
4905eab
remove need create pretrainedconfig
echarlaix Jul 7, 2023
a6a68d0
fix style
echarlaix Jul 7, 2023
10497b3
fix dummy input generation
echarlaix Jul 7, 2023
89eb71c
fix style
echarlaix Jul 7, 2023
99755f2
fix style
echarlaix Jul 7, 2023
1777870
add saving second tokenzier when exporting a SD XL model
echarlaix Jul 7, 2023
4aab693
fix style
echarlaix Jul 7, 2023
044a1fa
add SD XL pipeline
echarlaix Jul 10, 2023
99c9736
fix style
echarlaix Jul 10, 2023
45d94c1
add test
echarlaix Jul 10, 2023
ffb337d
add watermarker
echarlaix Jul 10, 2023
3ad4355
fix style
echarlaix Jul 10, 2023
b53e3c3
fix style
echarlaix Jul 10, 2023
cd437ff
fix style
echarlaix Jul 10, 2023
77aaebf
add watermark
echarlaix Jul 11, 2023
40a62e7
add test
echarlaix Jul 11, 2023
02e0304
set default height width stable diffusion pipeline
echarlaix Jul 11, 2023
46d9d1d
add img2img
echarlaix Jul 11, 2023
080f2ba
disable img2img until added
echarlaix Jul 11, 2023
b5c8909
remove img2img
echarlaix Jul 11, 2023
e5d0bb4
add img2img
echarlaix Jul 11, 2023
74457ee
remove redundant
echarlaix Jul 11, 2023
bac78eb
fix style
echarlaix Jul 11, 2023
00d26ba
fix style
echarlaix Jul 11, 2023
fc48e94
enable to only have the second tokenizer and text encoder
echarlaix Jul 11, 2023
6c12200
add test
echarlaix Jul 12, 2023
c522840
minor
echarlaix Jul 12, 2023
13bdbe3
fix cli export
echarlaix Jul 13, 2023
a0d5592
Merge branch 'main' into sd-XL
echarlaix Jul 13, 2023
0426fa4
test for batch size > 1
echarlaix Jul 17, 2023
46 changes: 18 additions & 28 deletions optimum/exporters/onnx/__main__.py
@@ -23,15 +23,7 @@
from transformers.utils import is_torch_available

from ...commands.export.onnx import parse_args_onnx
from ...utils import (
DEFAULT_DUMMY_SHAPES,
DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER,
DIFFUSION_MODEL_UNET_SUBFOLDER,
DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER,
DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER,
ONNX_WEIGHTS_NAME,
logging,
)
from ...utils import DEFAULT_DUMMY_SHAPES, ONNX_WEIGHTS_NAME, logging
from ...utils.save_utils import maybe_save_preprocessors
from ..error_utils import AtolError, OutputMatchError, ShapeError
from ..tasks import TasksManager
@@ -71,8 +63,9 @@ def _get_submodels_and_onnx_configs(
custom_architecture: bool,
fn_get_submodels: Optional[Callable] = None,
):
is_stable_diffusion = "stable-diffusion" in task
if not custom_architecture:
if task == "stable-diffusion":
if is_stable_diffusion:
onnx_config = None
models_and_onnx_configs = get_stable_diffusion_models_for_export(model)
else:
@@ -104,7 +97,7 @@ def _get_submodels_and_onnx_configs(
if fn_get_submodels is not None:
submodels_for_export = fn_get_submodels(model)
else:
if task == "stable-diffusion":
if is_stable_diffusion:
submodels_for_export = _get_submodels_for_export_stable_diffusion(model)
elif (
model.config.is_encoder_decoder
@@ -312,7 +305,9 @@ def main_export(
)

custom_architecture = False
if task != "stable-diffusion" and model.config.model_type.replace(
is_stable_diffusion = "stable-diffusion" in task

if not is_stable_diffusion and model.config.model_type.replace(
"-", "_"
) not in TasksManager.get_supported_model_type_for_task(task, exporter="onnx"):
custom_architecture = True
@@ -330,7 +325,7 @@

if (
not custom_architecture
and task != "stable-diffusion"
and not is_stable_diffusion
and task + "-with-past"
in TasksManager.get_supported_tasks_for_model_type(model.config.model_type.replace("_", "-"), "onnx")
):
@@ -367,7 +362,7 @@
fn_get_submodels=fn_get_submodels,
)

if task != "stable-diffusion":
if not is_stable_diffusion:
needs_pad_token_id = (
isinstance(onnx_config, OnnxConfigWithPast)
and getattr(model.config, "pad_token_id", None) is None
@@ -415,28 +410,23 @@

onnx_files_subpaths = None
else:
onnx_files_subpaths = [
DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER,
DIFFUSION_MODEL_UNET_SUBFOLDER,
DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER,
DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER,
]

# save the subcomponent configuration
for model_name, name_dir in zip(models_and_onnx_configs, onnx_files_subpaths):
for model_name in models_and_onnx_configs:
subcomponent = models_and_onnx_configs[model_name][0]
if hasattr(subcomponent, "save_config"):
subcomponent.save_config(output / name_dir)
subcomponent.save_config(output / model_name)
elif hasattr(subcomponent, "config") and hasattr(subcomponent.config, "save_pretrained"):
subcomponent.config.save_pretrained(output / name_dir)
subcomponent.config.save_pretrained(output / model_name)

onnx_files_subpaths = [os.path.join(path, ONNX_WEIGHTS_NAME) for path in onnx_files_subpaths]
onnx_files_subpaths = [os.path.join(name_dir, ONNX_WEIGHTS_NAME) for name_dir in models_and_onnx_configs]

# Saving the additional components needed to perform inference.
model.tokenizer.save_pretrained(output.joinpath("tokenizer"))
model.scheduler.save_pretrained(output.joinpath("scheduler"))
if model.feature_extractor is not None:
if getattr(model, "feature_extractor", None) is not None:
model.feature_extractor.save_pretrained(output.joinpath("feature_extractor"))
if getattr(model, "tokenizer_2", None) is not None:
model.tokenizer_2.save_pretrained(output.joinpath("tokenizer_2"))
model.save_config(output)

_, onnx_outputs = export_models(
@@ -464,7 +454,7 @@

# Optionally post process the obtained ONNX file(s), for example to merge the decoder / decoder with past if any
# TODO: treating stable diffusion separately is quite ugly
if not no_post_process and task != "stable-diffusion":
if not no_post_process and not is_stable_diffusion:
try:
logger.info("Post-processing the exported models...")
models_and_onnx_configs, onnx_files_subpaths = onnx_config.post_process_exported_models(
@@ -475,7 +465,7 @@
f"The post-processing of the ONNX export failed. The export can still be performed by passing the option --no-post-process. Detailed error: {e}"
)

if task == "stable-diffusion":
if is_stable_diffusion:
use_subprocess = (
False # TODO: fix Can't pickle local object 'get_stable_diffusion_models_for_export.<locals>.<lambda>'
)
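
Taken together, the changes above let a single main_export call handle an SD XL checkpoint: the stable-diffusion branch now keys the exported subfolders off the submodel names and also saves the second tokenizer. The following is a hedged sketch of the intended invocation, not part of the diff; the model id, output path, and the exact directory layout are assumptions based on the saving logic in this hunk.

# Hedged sketch (not part of this PR): exporting an SD XL checkpoint with main_export.
# The model id and output directory are placeholders.
from optimum.exporters.onnx import main_export

main_export(
    model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0",
    output="sdxl_onnx",
    task="stable-diffusion-xl",
)

# Based on the saving logic above, the output directory should roughly contain:
#   text_encoder/, text_encoder_2/, unet/, vae_encoder/, vae_decoder/   (model.onnx + config)
#   tokenizer/, tokenizer_2/, scheduler/                                (preprocessor files)
#   plus the pipeline configuration saved by model.save_config(output)
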
2 changes: 2 additions & 0 deletions optimum/exporters/onnx/convert.py
@@ -369,6 +369,8 @@ def _run_validation(
if isinstance(value, (list, tuple)):
value = config.flatten_output_collection_property(name, value)
onnx_inputs.update({tensor_name: pt_tensor.cpu().numpy() for tensor_name, pt_tensor in value.items()})
elif isinstance(value, (dict)):
onnx_inputs.update({tensor_name: pt_tensor.cpu().numpy() for tensor_name, pt_tensor in value.items()})
else:
onnx_inputs[name] = value.cpu().numpy()

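
The new elif branch feeds dict-valued entries (for example the UNet's added_cond_kwargs) into the ONNX Runtime input dictionary during validation. Below is a hedged, self-contained illustration of that flattening with made-up tensors; the shapes follow the usual SD XL convention and are not taken from the diff.

# Hedged illustration: flattening a nested dict of tensors into named ONNX inputs,
# mirroring the dict branch added above.
import torch

value = {
    "text_embeds": torch.randn(2, 1280),  # pooled prompt embedding (assumed size)
    "time_ids": torch.zeros(2, 6),        # original_size + crop coords + target_size
}
onnx_inputs = {}
onnx_inputs.update({tensor_name: pt_tensor.cpu().numpy() for tensor_name, pt_tensor in value.items()})
print(list(onnx_inputs))  # ['text_embeds', 'time_ids']
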
44 changes: 41 additions & 3 deletions optimum/exporters/onnx/model_configs.py
@@ -658,14 +658,15 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
}


class CLIPTextOnnxConfig(TextEncoderOnnxConfig):
class CLIPTextWithProjectionOnnxConfig(TextEncoderOnnxConfig):
ATOL_FOR_VALIDATION = 1e-3
# The ONNX export of this architecture needs the Trilu operator support, available since opset 14
DEFAULT_ONNX_OPSET = 14

NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args(
vocab_size="vocab_size",
sequence_length="max_position_embeddings",
num_layers="num_hidden_layers",
allow_new=True,
)

@@ -677,13 +678,33 @@ def inputs(self) -> Dict[str, Dict[int, str]]:

@property
def outputs(self) -> Dict[str, Dict[int, str]]:
return {
outputs = {
"text_embeds": {0: "batch_size", 1: "sequence_length"},
"last_hidden_state": {0: "batch_size", 1: "sequence_length"},
}
if self._normalized_config.output_hidden_states:
for i in range(self._normalized_config.num_layers + 1):
outputs[f"hidden_states.{i}"] = {0: "batch_size", 1: "sequence_length"}

return outputs


class CLIPTextOnnxConfig(CLIPTextWithProjectionOnnxConfig):
@property
def outputs(self) -> Dict[str, Dict[int, str]]:
outputs = {
"last_hidden_state": {0: "batch_size", 1: "sequence_length"},
"pooler_output": {0: "batch_size"},
}
if self._normalized_config.output_hidden_states:
for i in range(self._normalized_config.num_layers + 1):
outputs[f"hidden_states.{i}"] = {0: "batch_size", 1: "sequence_length"}

return outputs

def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs)

if framework == "pt":
import torch

@@ -713,12 +734,19 @@ class UNetOnnxConfig(VisionOnnxConfig):

@property
def inputs(self) -> Dict[str, Dict[int, str]]:
return {
inputs = {
"sample": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"},
"timestep": {0: "steps"},
"encoder_hidden_states": {0: "batch_size", 1: "sequence_length"},
}

# TODO : add text_image, image and image_embeds
if getattr(self._normalized_config, "addition_embed_type", None) == "text_time":
inputs["text_embeds"] = {0: "batch_size"}
inputs["time_ids"] = {0: "batch_size"}

return inputs

@property
def outputs(self) -> Dict[str, Dict[int, str]]:
return {
@@ -734,8 +762,18 @@ def torch_to_onnx_output_map(self) -> Dict[str, str]:
def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs)
dummy_inputs["encoder_hidden_states"] = dummy_inputs["encoder_hidden_states"][0]

if getattr(self._normalized_config, "addition_embed_type", None) == "text_time":
dummy_inputs["added_cond_kwargs"] = {
"text_embeds": dummy_inputs.pop("text_embeds"),
"time_ids": dummy_inputs.pop("time_ids"),
}

return dummy_inputs

def ordered_inputs(self, model) -> Dict[str, Dict[int, str]]:
return self.inputs


class VaeEncoderOnnxConfig(VisionOnnxConfig):
ATOL_FOR_VALIDATION = 1e-2
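
For SD XL UNets (addition_embed_type == "text_time"), UNetOnnxConfig now declares two extra flat inputs, text_embeds and time_ids, and generate_dummy_inputs regroups them into added_cond_kwargs for the PyTorch forward pass. A hedged sketch of inspecting the resulting config follows; it reuses the constructor pattern from the export utilities, and the model id is a placeholder.

# Hedged sketch (not part of the diff): inspecting the UNet ONNX inputs for an SD XL model.
# The task/model_type pair mirrors what the stable-diffusion export utilities use.
from diffusers import StableDiffusionXLPipeline
from optimum.exporters.tasks import TasksManager

pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
unet_config_constructor = TasksManager.get_exporter_config_constructor(
    model=pipe.unet, exporter="onnx", task="semantic-segmentation", model_type="unet"
)
unet_onnx_config = unet_config_constructor(pipe.unet.config)
print(unet_onnx_config.inputs)
# Expected to include, besides sample / timestep / encoder_hidden_states:
#   "text_embeds": {0: "batch_size"} and "time_ids": {0: "batch_size"}
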
23 changes: 23 additions & 0 deletions optimum/exporters/onnx/utils.py
@@ -100,14 +100,23 @@ def _get_submodels_for_export_stable_diffusion(
"""
Returns the components of a Stable Diffusion model.
"""
from diffusers import StableDiffusionXLPipeline

models_for_export = {}

if isinstance(pipeline, StableDiffusionXLPipeline):
pipeline.text_encoder.config.output_hidden_states = True
projection_dim = pipeline.text_encoder_2.config.projection_dim
else:
projection_dim = pipeline.text_encoder.config.projection_dim

# Text encoder
models_for_export["text_encoder"] = pipeline.text_encoder

# U-NET
# PyTorch does not support the ONNX export of torch.nn.functional.scaled_dot_product_attention
pipeline.unet.set_attn_processor(AttnProcessor())
pipeline.unet.config.text_encoder_projection_dim = projection_dim
models_for_export["unet"] = pipeline.unet

# VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565
@@ -124,6 +133,10 @@
vae_decoder.forward = lambda latent_sample: vae_decoder.decode(z=latent_sample)
models_for_export["vae_decoder"] = vae_decoder

if getattr(pipeline, "text_encoder_2", None) is not None:
pipeline.text_encoder_2.config.output_hidden_states = True
models_for_export["text_encoder_2"] = pipeline.text_encoder_2

return models_for_export


@@ -278,6 +291,16 @@ def get_stable_diffusion_models_for_export(
vae_onnx_config = vae_config_constructor(vae_decoder.config)
models_for_export["vae_decoder"] = (vae_decoder, vae_onnx_config)

if "text_encoder_2" in models_for_export:
onnx_config_constructor = TasksManager.get_exporter_config_constructor(
model=pipeline.text_encoder_2,
exporter="onnx",
task="feature-extraction",
model_type="clip-text-with-projection",
)
onnx_config = onnx_config_constructor(pipeline.text_encoder_2.config)
models_for_export["text_encoder_2"] = (models_for_export["text_encoder_2"], onnx_config)

return models_for_export


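
With these additions, the export utilities also collect text_encoder_2 and force both text encoders to return hidden states, while the UNet config gains the text_encoder_projection_dim it needs for the "text_time" embeddings. A hedged sketch of calling the updated helper directly; the model id is a placeholder and the expected keys follow from the code above.

# Hedged sketch (not part of the diff): collecting SD XL submodels and their ONNX configs.
from diffusers import StableDiffusionXLPipeline
from optimum.exporters.onnx.utils import get_stable_diffusion_models_for_export

pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
models_and_onnx_configs = get_stable_diffusion_models_for_export(pipe)
print(sorted(models_and_onnx_configs))
# Expected: ['text_encoder', 'text_encoder_2', 'unet', 'vae_decoder', 'vae_encoder'],
# each mapped to a (submodel, OnnxConfig) pair.
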
23 changes: 19 additions & 4 deletions optimum/exporters/tasks.py
@@ -171,6 +171,7 @@ class TasksManager:
"audio-xvector": "AutoModelForAudioXVector",
"image-to-text": "AutoModelForVision2Seq",
"stable-diffusion": "StableDiffusionPipeline",
"stable-diffusion-xl": "StableDiffusionXLPipeline",
"zero-shot-image-classification": "AutoModelForZeroShotImageClassification",
"zero-shot-object-detection": "AutoModelForZeroShotObjectDetection",
}
@@ -267,6 +268,7 @@ class TasksManager:
"image-to-text": "transformers",
"sentence-similarity": "transformers",
"stable-diffusion": "diffusers",
"stable-diffusion-xl": "diffusers",
"summarization": "transformers",
"visual-question-answering": "transformers",
"zero-shot-classification": "transformers",
@@ -390,6 +392,10 @@ class TasksManager:
"feature-extraction",
onnx="CLIPTextOnnxConfig",
),
"clip-text-with-projection": supported_tasks_mapping(
"feature-extraction",
onnx="CLIPTextWithProjectionOnnxConfig",
),
"codegen": supported_tasks_mapping(
"feature-extraction",
"feature-extraction-with-past",
@@ -931,7 +937,14 @@ class TasksManager:
onnx="YolosOnnxConfig",
),
}
_UNSUPPORTED_CLI_MODEL_TYPE = {"unet", "vae-encoder", "vae-decoder", "clip-text-model", "trocr"}
_UNSUPPORTED_CLI_MODEL_TYPE = {
"unet",
"vae-encoder",
"vae-decoder",
"clip-text-model",
"clip-text-with-projection",
"trocr",
}
_SUPPORTED_CLI_MODEL_TYPE = set(_SUPPORTED_MODEL_TYPE.keys()) - _UNSUPPORTED_CLI_MODEL_TYPE

@classmethod
@@ -1271,7 +1284,7 @@ def _infer_task_from_model_or_model_class(
(
target_name.startswith("Auto"),
target_name.startswith("TFAuto"),
target_name == "StableDiffusionPipeline",
"StableDiffusion" in target_name,
)
):
if target_name == auto_cls_name:
@@ -1314,8 +1327,10 @@ def _infer_task_from_model_name_or_path(
model_info = huggingface_hub.model_info(model_name_or_path, revision=revision)
if model_info.library_name == "diffusers":
# TODO : getattr(model_info, "model_index") defining auto_model_class_name currently set to None
if "stable-diffusion" in model_info.tags:
inferred_task_name = "stable-diffusion"
for task in ("stable-diffusion-xl", "stable-diffusion"):
if task in model_info.tags:
inferred_task_name = task
break
else:
pipeline_tag = getattr(model_info, "pipeline_tag", None)
# conversational is not a supported task per se, just an alias that may map to
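
Because the tag loop checks "stable-diffusion-xl" before "stable-diffusion", task inference from Hub tags now resolves SD XL checkpoints to the new task. A hedged sketch, assuming the usual TasksManager entry point and a placeholder model id:

# Hedged sketch (not part of the diff): inferring the task from a Hub model's tags.
from optimum.exporters.tasks import TasksManager

task = TasksManager.infer_task_from_model("stabilityai/stable-diffusion-xl-base-1.0")
print(task)  # expected to resolve to "stable-diffusion-xl" for an SD XL checkpoint
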
4 changes: 4 additions & 0 deletions optimum/onnxruntime/__init__.py
@@ -71,12 +71,14 @@
"ORTStableDiffusionPipeline",
"ORTStableDiffusionImg2ImgPipeline",
"ORTStableDiffusionInpaintPipeline",
"ORTStableDiffusionXLPipeline",
]
else:
_import_structure["modeling_diffusion"] = [
"ORTStableDiffusionPipeline",
"ORTStableDiffusionImg2ImgPipeline",
"ORTStableDiffusionInpaintPipeline",
"ORTStableDiffusionXLPipeline",
]


@@ -124,12 +126,14 @@
ORTStableDiffusionImg2ImgPipeline,
ORTStableDiffusionInpaintPipeline,
ORTStableDiffusionPipeline,
ORTStableDiffusionXLPipeline,
)
else:
from .modeling_diffusion import (
ORTStableDiffusionImg2ImgPipeline,
ORTStableDiffusionInpaintPipeline,
ORTStableDiffusionPipeline,
ORTStableDiffusionXLPipeline,
)
else:
import sys
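
The re-exported symbol is meant to be used like the existing ORT diffusion pipelines. A hedged usage sketch; the model id, prompt, output file name, and the export=True convenience flag are assumptions following the established ORTStableDiffusionPipeline API rather than something shown in this diff.

# Hedged usage sketch (not part of the diff): running SD XL inference with ONNX Runtime.
from optimum.onnxruntime import ORTStableDiffusionXLPipeline

pipeline = ORTStableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", export=True
)
image = pipeline(prompt="sailing ship in storm by Leonardo da Vinci").images[0]
image.save("sdxl_ship.png")
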