SD img2img (#25)
* sd_img2img

* update

* add benchmark

* update

Co-authored-by: Terry Chen <terrychen@meta.com>
terrychenism and Terry Chen authored Oct 7, 2022
1 parent 70ff7da commit 445a20e
Showing 4 changed files with 493 additions and 3 deletions.
23 changes: 22 additions & 1 deletion examples/05_stable_diffusion/README.md
@@ -6,7 +6,7 @@ In this example, we show how to build fast AIT modules for CLIP, UNet, VAE model

First, clone, build, and install AITemplate [per the README instructions](https://github.com/facebookincubator/AITemplate#clone-the-code).

This AIT stable diffusion example depends on `diffusers`, `transformers`, `torch` and `click`.

Verify the library versions. We have tested transformers 4.21/4.22/4.23, diffusers 0.3/0.4 and torch 1.11/1.12.
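
For reference, one tested combination can be installed with pip. The exact patch versions below are assumptions chosen from within the tested ranges above:

```
pip3 install "transformers==4.22.0" "diffusers==0.4.0" "torch==1.12.0" click
```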

@@ -30,6 +30,11 @@ python3 examples/05_stable_diffusion/compile.py --token ACCESS_TOKEN
```
It generates three folders: `./tmp/CLIPTextModel`, `./tmp/UNet2DConditionModel`, `./tmp/AutoencoderKL`. In each folder, there is a `test.so` file which is the generated AIT module for the model.
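
Each `test.so` can be loaded directly with the AIT runtime. A minimal sketch, assuming the compile step above has completed (the demo pipeline below performs this loading internally):

```
from aitemplate.compiler import Model

# load the compiled UNet module produced by compile.py
unet_module = Model("./tmp/UNet2DConditionModel/test.so")
```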

Compile the img2img models:
```
python3 examples/05_stable_diffusion/compile.py --img2img True --token ACCESS_TOKEN
```

#### Multi-GPU profiling
AIT needs to do profiling to select the best algorithms for CUTLASS and CK.
To enable multiple GPUs for profiling, set the environment variable `CUDA_VISIBLE_DEVICES` on NVIDIA platforms or `HIP_VISIBLE_DEVICES` on AMD platforms.
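
For example, to profile with four GPUs on an NVIDIA platform (device ids are illustrative):

```
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 examples/05_stable_diffusion/compile.py --token ACCESS_TOKEN
```
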
@@ -50,6 +55,12 @@ Run AIT models with an example image:
python3 examples/05_stable_diffusion/demo.py --token ACCESS_TOKEN
```

Img2img demo:

```
python3 examples/05_stable_diffusion/demo_img2img.py --token ACCESS_TOKEN
```
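
The script also accepts a `--prompt` flag; shown here with its default value:

```
python3 examples/05_stable_diffusion/demo_img2img.py --token ACCESS_TOKEN --prompt "A fantasy landscape, trending on artstation"
```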

Check the resulting image: `example_ait.png`


@@ -131,10 +142,20 @@ _OOM = Out of Memory_
| 16 | 7906 | 0.49 |


## IMG2IMG

### A100-40GB / CUDA 11.6, 40 steps

| Module | PT Latency (ms) | AIT Latency (ms) |
|----------|-----------------|------------------|
| Pipeline | 4163.60 | 1785.46 |
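
The AIT pipeline latency can be reproduced (hardware permitting) with the demo's `--benchmark` flag:

```
python3 examples/05_stable_diffusion/demo_img2img.py --token ACCESS_TOKEN --benchmark True
```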



### Note for Performance Results

- For all benchmarks we render images of size 512x512.
- The img2img model only supports a fixed 512x768 input by default; stay tuned for dynamic shape support.
- For NVIDIA A100, our test cluster doesn't allow us to lock the clock frequency. We use a longer warm-up to collect more stable results, but some small variance relative to locked-frequency results is expected.
- To benchmark a single GCD of the MI-250, we lock the frequency with the command `rocm-smi -d x --setperfdeterminism 1700`, where `x` is the GPU id.
- Performance results are what we can reproduce and reference; they should not be used for other purposes.
8 changes: 6 additions & 2 deletions examples/05_stable_diffusion/compile.py
@@ -317,9 +317,10 @@ def compile_vae(
@click.command()
@click.option("--token", default="", help="access token")
@click.option("--batch-size", default=1, help="batch size")
@click.option("--img2img", default=False, help="compile img2img models")
@click.option("--use-fp16-acc", default=True, help="use fp16 accumulation")
@click.option("--convert-conv-to-gemm", default=True, help="convert 1x1 conv to gemm")
def compile_diffusers(token, batch_size, use_fp16_acc=True, convert_conv_to_gemm=True):
def compile_diffusers(token, batch_size, img2img=False, use_fp16_acc=True, convert_conv_to_gemm=True):
    logging.getLogger().setLevel(logging.INFO)
    np.random.seed(0)
    torch.manual_seed(4896)
@@ -338,16 +339,19 @@ def compile_diffusers(token, batch_size, use_fp16_acc=True, convert_conv_to_gemm
        use_auth_token=access_token,
    ).to("cuda")

    # latent-grid width: the SD VAE downsamples by 8x, so the 768-px-wide
    # img2img input maps to 96 latent columns (vs. 64 for 512)
    width = 96 if img2img else 64

    # CLIP
    compile_clip(batch_size=batch_size, use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm)
    # UNet
    compile_unet(
        batch_size=batch_size * 2,
        ww=width,
        use_fp16_acc=use_fp16_acc,
        convert_conv_to_gemm=convert_conv_to_gemm,
    )
    # VAE
    compile_vae(batch_size=batch_size, use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm)
    compile_vae(batch_size=batch_size, width=width, use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm)


if __name__ == "__main__":
67 changes: 67 additions & 0 deletions examples/05_stable_diffusion/demo_img2img.py
@@ -0,0 +1,67 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from io import BytesIO

import click
import requests
import torch
from PIL import Image

from aitemplate.testing.benchmark_pt import benchmark_torch_function
from pipeline_stable_diffusion_img2img_ait import StableDiffusionImg2ImgAITPipeline


@click.command()
@click.option("--token", default="", help="access token")
@click.option(
    "--prompt", default="A fantasy landscape, trending on artstation", help="prompt"
)
@click.option(
    "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
)
def run(token, prompt, benchmark):
    # load the pipeline
    device = "cuda"
    model_id_or_path = "CompVis/stable-diffusion-v1-4"
    pipe = StableDiffusionImg2ImgAITPipeline.from_pretrained(
        model_id_or_path,
        revision="fp16",
        torch_dtype=torch.float16,
        use_auth_token=token,
    )
    pipe = pipe.to(device)

    # let's download an initial image
    url = "https://github.com/raw/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"

    response = requests.get(url)
    init_image = Image.open(BytesIO(response.content)).convert("RGB")
    init_image = init_image.resize((768, 512))

    with torch.autocast("cuda"):
        images = pipe(
            prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5
        ).images
        if benchmark:
            # optionally time the full pipeline over 10 iterations
            args = (prompt, init_image)
            t = benchmark_torch_function(10, pipe, *args)
            print(f"sd e2e: {t} ms")

    images[0].save("fantasy_landscape_ait.png")


if __name__ == "__main__":
    run()
398 changes: 398 additions & 0 deletions examples/05_stable_diffusion/pipeline_stable_diffusion_img2img_ait.py
Large diffs are not rendered by default.
