From 708307f4d8109fb9f6f05bd40dd9a6a1cbf42305 Mon Sep 17 00:00:00 2001
From: Chenxi
Date: Sat, 20 Nov 2021 21:13:24 +0000
Subject: [PATCH 1/3] replicate demo

---
 README.md  |  1 +
 cog.yaml   | 20 +++++++++++++++++
 predict.py | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 85 insertions(+)
 create mode 100644 cog.yaml
 create mode 100644 predict.py

diff --git a/README.md b/README.md
index 6f3af59..b1a2a70 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,7 @@ Y-tech, Kuaishou Technology
 
 ### [Project page](https://onion-liu.github.io/BlendGAN) | [Paper](https://arxiv.org/abs/2110.11728)
 
+
 Abstract: *Generative Adversarial Networks (GANs) have made a dramatic leap in high-fidelity image synthesis and stylized face generation. Recently, a layer-swapping mechanism has been developed to improve the stylization performance. However, this method is incapable of fitting arbitrary styles in a single model and requires hundreds of style-consistent training images for each style. To address the above issues, we propose BlendGAN for arbitrary stylized face generation by leveraging a flexible blending strategy and a generic artistic dataset. Specifically, we first train a self-supervised style encoder on the generic artistic dataset to extract the representations of arbitrary styles. In addition, a weighted blending module (WBM) is proposed to blend face and style representations implicitly and control the arbitrary stylization effect. By doing so, BlendGAN can gracefully fit arbitrary styles in a unified model while avoiding case-by-case preparation of style-consistent training images. To this end, we also present a novel large-scale artistic face dataset AAHQ. Extensive experiments demonstrate that BlendGAN outperforms state-of-the-art methods in terms of visual quality and style diversity for both latent-guided and reference-guided stylized face synthesis.*
 
 
diff --git a/cog.yaml b/cog.yaml
new file mode 100644
index 0000000..b734c02
--- /dev/null
+++ b/cog.yaml
@@ -0,0 +1,20 @@
+build:
+  gpu: true
+  python_version: "3.8"
+  system_packages:
+    - "libgl1-mesa-glx"
+    - "libglib2.0-0"
+    - "ninja-build"
+  python_packages:
+    - "ipython==7.21.0"
+    - "torch==1.7.1"
+    - "torchvision==0.8.2"
+    - "numpy==1.19.4"
+    - "tqdm==4.54.1"
+    - "opencv-python==4.4.0.46"
+    - "scipy==1.7.2"
+    - "cmake==3.22.0"
+  run:
+    - pip install dlib
+
+predict: "predict.py:Predictor"
diff --git a/predict.py b/predict.py
new file mode 100644
index 0000000..a54c053
--- /dev/null
+++ b/predict.py
@@ -0,0 +1,64 @@
+import torch
+import tempfile
+import cv2
+import random
+import numpy as np
+from pathlib import Path
+import cog
+from ffhq_dataset.gen_aligned_image import FaceAlign
+from model import Generator
+from psp_encoder.psp_encoders import PSPEncoder
+from utils import ten2cv, cv2ten
+
+
+class Predictor(cog.Predictor):
+    def setup(self):
+        size = 1024
+        latent = 512
+        n_mlp = 8
+        self.device = 'cuda'
+        checkpoint = torch.load('pretrained_models/blendgan.pt')
+        model_dict = checkpoint['g_ema']
+
+        self.g_ema = Generator(size, latent, n_mlp, channel_multiplier=2).to(self.device)
+        self.g_ema.load_state_dict(model_dict)
+        self.g_ema.eval()
+        self.psp_encoder = PSPEncoder('pretrained_models/psp_encoder.pt', output_size=1024).to(self.device)
+        self.psp_encoder.eval()
+        self.fa = FaceAlign()
+
+    @cog.input(
+        "source",
+        type=Path,
+        help="source facial image, it will be aligned and resized to 1024x1024 first",
+    )
+    @cog.input(
+        "style",
+        type=Path,
+        help="style reference facial image, it will be aligned and resized to 1024x1024 first",
+    )
+    def predict(self, source, style):
+        seed = 0
+        random.seed(seed)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+        add_weight_index = 6
+        # face alignment
+        source_img = cv2.imread(str(source))
+        style_img = cv2.imread(str(style))
+        source_img_crop = self.fa.get_crop_image(source_img)
+        style_img_crop = self.fa.get_crop_image(style_img)
+        source_img_ten = cv2ten(source_img_crop, self.device)
+        style_img_ten = cv2ten(style_img_crop, self.device)
+        with torch.no_grad():
+            sample_style = self.g_ema.get_z_embed(style_img_ten)
+            sample_in = self.psp_encoder(source_img_ten)
+            img_out_ten, _ = self.g_ema([sample_in], z_embed=sample_style, add_weight_index=add_weight_index,
+                                        input_is_latent=True, return_latents=False, randomize_noise=False)
+            img_out = ten2cv(img_out_ten)
+        out = img_out
+        out_path = Path(tempfile.mkdtemp()) / "out.png"
+        cv2.imwrite(str(out_path), out)
+        return out_path

From 9799c0114941d93801edc57783e6c65244091219 Mon Sep 17 00:00:00 2001
From: Chenxi
Date: Thu, 29 Sep 2022 01:34:01 +0100
Subject: [PATCH 2/3] Upgrade to Cog version 0.1

---
 predict.py | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/predict.py b/predict.py
index a54c053..b4c7b05 100644
--- a/predict.py
+++ b/predict.py
@@ -4,14 +4,15 @@
 import random
 import numpy as np
 from pathlib import Path
-import cog
+from cog import BasePredictor, Path, Input
+
 from ffhq_dataset.gen_aligned_image import FaceAlign
 from model import Generator
 from psp_encoder.psp_encoders import PSPEncoder
 from utils import ten2cv, cv2ten
 
 
-class Predictor(cog.Predictor):
+class Predictor(BasePredictor):
     def setup(self):
         size = 1024
         latent = 512
@@ -27,17 +28,15 @@ def setup(self):
         self.psp_encoder.eval()
         self.fa = FaceAlign()
 
-    @cog.input(
-        "source",
-        type=Path,
-        help="source facial image, it will be aligned and resized to 1024x1024 first",
-    )
-    @cog.input(
-        "style",
-        type=Path,
-        help="style reference facial image, it will be aligned and resized to 1024x1024 first",
-    )
-    def predict(self, source, style):
+    def predict(
+        self,
+        source: Path = Input(
+            description="source facial image, it will be aligned and resized to 1024x1024 first",
+        ),
+        style: Path = Input(
+            description="style reference facial image, it will be aligned and resized to 1024x1024 first",
+        ),
+    ) -> Path:
         seed = 0
         random.seed(seed)
         np.random.seed(seed)

From ec18bb760047dcdacab0a714b0bb5c0811bd4189 Mon Sep 17 00:00:00 2001
From: Chenxi
Date: Thu, 29 Sep 2022 01:36:20 +0100
Subject: [PATCH 3/3] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b1a2a70..fe2ab9d 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ Y-tech, Kuaishou Technology
 
 ### [Project page](https://onion-liu.github.io/BlendGAN) | [Paper](https://arxiv.org/abs/2110.11728)
 
-
+[![Replicate](https://replicate.com/onion-liu/blendgan/badge)](https://replicate.com/onion-liu/blendgan)
 Abstract: *Generative Adversarial Networks (GANs) have made a dramatic leap in high-fidelity image synthesis and stylized face generation. Recently, a layer-swapping mechanism has been developed to improve the stylization performance. However, this method is incapable of fitting arbitrary styles in a single model and requires hundreds of style-consistent training images for each style. To address the above issues, we propose BlendGAN for arbitrary stylized face generation by leveraging a flexible blending strategy and a generic artistic dataset. Specifically, we first train a self-supervised style encoder on the generic artistic dataset to extract the representations of arbitrary styles. In addition, a weighted blending module (WBM) is proposed to blend face and style representations implicitly and control the arbitrary stylization effect. By doing so, BlendGAN can gracefully fit arbitrary styles in a unified model while avoiding case-by-case preparation of style-consistent training images. To this end, we also present a novel large-scale artistic face dataset AAHQ. Extensive experiments demonstrate that BlendGAN outperforms state-of-the-art methods in terms of visual quality and style diversity for both latent-guided and reference-guided stylized face synthesis.*
 
 
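
Note (not part of the patches): the predictor added by this series can also be exercised directly from Python, outside the `cog predict` CLI. The snippet below is a minimal local smoke test, assuming the BlendGAN repository root as the working directory, `pretrained_models/blendgan.pt` and `pretrained_models/psp_encoder.pt` downloaded, the `cog` Python package installed, a CUDA GPU available, and hypothetical input files `face.jpg` and `style.jpg`.

```python
# Minimal smoke test for the Cog predictor defined in predict.py (after PATCH 2/3).
# Assumes: repo root as cwd, weights in pretrained_models/, `cog` installed, CUDA GPU.
from pathlib import Path

from predict import Predictor  # the class registered in cog.yaml as "predict.py:Predictor"

predictor = Predictor()
predictor.setup()  # loads the BlendGAN generator and the pSp encoder onto the GPU

# Both images are aligned and resized to 1024x1024 inside predict().
out_path = predictor.predict(
    source=Path("face.jpg"),    # hypothetical source portrait
    style=Path("style.jpg"),    # hypothetical style reference image
)
print(f"stylized result written to {out_path}")
```

Through the Cog CLI, the equivalent call is `cog predict -i source=@face.jpg -i style=@style.jpg`, which is how the Replicate page linked from the badge runs the model.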