-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathvae.py
More file actions
78 lines (60 loc) · 2.77 KB
/
vae.py
File metadata and controls
78 lines (60 loc) · 2.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from diffusers import DiffusionPipeline
import torch
import numpy as np
from PIL import Image
import PIL
from typing import Callable, List, Optional, Union
# Load the Stable Diffusion v1.5 pipeline once at import time; the functions
# in this module use this shared instance.
sd_model = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", local_files_only=False
)
# Prefer the GPU, but fall back to CPU so that importing this module does not
# crash on machines without CUDA.
sd_model.to("cuda" if torch.cuda.is_available() else "cpu")
# Inference only: freeze each sub-model's parameters and put it in eval mode.
for _module in (sd_model.vae, sd_model.text_encoder, sd_model.unet):
    _module.requires_grad_(False)
    _module.eval()
del _module  # do not leak the loop variable as a module attribute
"""modified from prepare_mask_and_masked_image https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py"""
def prepare_image(image) -> torch.Tensor:
    """Convert an image (or batch of images) into a float32 tensor batch.

    Adapted from diffusers' ``prepare_mask_and_masked_image`` with the mask
    handling removed.

    Args:
        image: One of
            * a ``torch.Tensor`` with 3 dimensions (first dimension must be 3)
              or 4 dimensions, with values already in ``[-1, 1]``;
            * a ``PIL.Image.Image`` or a ``numpy.ndarray``;
            * a list of PIL images or a list of arrays (the type of the
              first element decides how the whole list is handled).
            Arrays are assumed to be channels-last ``(H, W, C)`` with values
            in the 0..255 range -- this is not validated.

    Returns:
        A float32 tensor with 4 dimensions. Tensor inputs keep their values;
        PIL/array inputs are moved from axis order ``(0, 1, 2, 3)`` to
        ``(0, 3, 1, 2)`` and rescaled with ``x / 127.5 - 1.0``.

    Raises:
        AssertionError: If a tensor input has the wrong number of dimensions
            or a 3-dimensional tensor does not start with a size-3 axis.
        ValueError: If a tensor input has values outside ``[-1, 1]`` or an
            empty list is given.
        TypeError: If the input (or the first list element) is not a tensor,
            PIL image or numpy array.
    """
    if isinstance(image, torch.Tensor):
        # Promote a single (3, H, W) image to a batch of one.
        if image.ndim == 3:
            assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
            image = image.unsqueeze(0)

        # Only the image is validated here; this function takes no mask.
        assert image.ndim == 4, "Image must have 4 dimensions"

        # Tensor inputs are expected to be normalised already.
        if image.min() < -1 or image.max() > 1:
            raise ValueError("Image should be in [-1, 1] range")

        return image.to(dtype=torch.float32)

    # Wrap a lone image so the list handling below covers every case.
    if not isinstance(image, list):
        if not (isinstance(image, np.ndarray) or isinstance(image, PIL.Image.Image)):
            raise TypeError(
                "image must be a torch.Tensor, PIL image, numpy array, or a list "
                f"of PIL images / numpy arrays, got {type(image).__name__}"
            )
        image = [image]

    if not image:
        raise ValueError("image list must not be empty")

    first = image[0]
    if isinstance(first, np.ndarray):
        batch = np.concatenate([item[None, :] for item in image], axis=0)
    elif isinstance(first, PIL.Image.Image):
        # Force three channels so every image contributes an (H, W, 3) array.
        batch = np.concatenate(
            [np.array(item.convert("RGB"))[None, :] for item in image], axis=0
        )
    else:
        raise TypeError(
            "image list must contain PIL images or numpy arrays, got "
            f"{type(first).__name__}"
        )

    # Channels-last batch -> channels-first, then map 0..255 onto [-1, 1].
    batch = batch.transpose(0, 3, 1, 2)
    return torch.from_numpy(batch).to(dtype=torch.float32) / 127.5 - 1.0
def encode(im):
    """Encode an image batch into scaled VAE latents.

    Args:
        im: Image tensor as produced by ``prepare_image``. It is moved to the
            VAE's device and dtype before encoding.

    Returns:
        A sample drawn from the encoder's latent distribution, multiplied by
        the VAE's configured ``scaling_factor``.
    """
    vae = sd_model.vae
    # prepare_image builds its tensor on the CPU in float32, while the
    # pipeline may live on another device; align both to avoid a
    # device/dtype mismatch inside the encoder.
    im = im.to(device=vae.device, dtype=vae.dtype)
    latent = vae.encode(im).latent_dist.sample()
    return vae.config.scaling_factor * latent
def decode_latents(latents):
    """Decode scaled latents back into an image array.

    Args:
        latents: Latent tensor that was multiplied by the VAE's
            ``scaling_factor`` (as ``encode`` does).

    Returns:
        A float32 numpy array with values clamped to ``[0, 1]`` and the
        tensor axes reordered from ``(0, 1, 2, 3)`` to ``(0, 2, 3, 1)``.
    """
    vae = sd_model.vae
    # Undo the scaling applied at encode time before running the decoder.
    inverse_scale = 1 / vae.config.scaling_factor
    decoded = vae.decode(inverse_scale * latents).sample
    # Shift and halve the decoder output, then clip it into [0, 1].
    pixels = torch.clamp(decoded / 2 + 0.5, 0, 1)
    # Always cast to float32: the overhead is negligible and it keeps the
    # numpy conversion compatible with bfloat16 weights.
    channels_last = pixels.cpu().permute(0, 2, 3, 1)
    return channels_last.float().numpy()
if __name__ == "__main__":
    # Round-trip demo: image file -> latents -> reconstructed image file.
    # The context manager closes the input file once its pixels are read.
    with Image.open('./test.jpg') as source:
        im = prepare_image(source)
    # prepare_image returns a CPU tensor; put it on the pipeline's device so
    # the encoder does not fail on a device mismatch.
    im = im.to(sd_model.device)
    with torch.no_grad():
        latent = encode(im)
        decoded = decode_latents(latent)
    decoded_im = sd_model.numpy_to_pil(decoded)
    decoded_im[0].save('decode.jpg')