add.py
import torch
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel, LMSDiscreteScheduler
from PIL import Image


class StableDiffusionModel:
    def __init__(self, model_id):
        # Load each pipeline component from its subfolder in the model repo
        self.tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder="tokenizer")
        self.text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder")
        self.unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet")
        self.vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae")
        self.scheduler = LMSDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.text_encoder.to(self.device)
        self.unet.to(self.device)
        self.vae.to(self.device)

    def encode_text(self, prompt):
        # Encode the prompt plus an empty prompt for classifier-free guidance
        text_input = self.tokenizer([prompt], return_tensors="pt", padding="max_length",
                                    truncation=True, max_length=self.tokenizer.model_max_length)
        uncond_input = self.tokenizer([""], return_tensors="pt", padding="max_length",
                                      truncation=True, max_length=self.tokenizer.model_max_length)
        with torch.no_grad():
            text_embedding = self.text_encoder(text_input.input_ids.to(self.device))[0]
            uncond_embedding = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
        # Concatenate so the UNet can process both branches in a single forward pass
        return torch.cat([uncond_embedding, text_embedding])

    def sample_latent_noise(self, batch_size, latent_shape):
        # Sample Gaussian noise scaled to the scheduler's initial sigma
        latents = torch.randn((batch_size, *latent_shape), device=self.device)
        return latents * self.scheduler.init_noise_sigma

    def denoise(self, latents, text_embedding, timestep, guidance_scale):
        # Predict noise for the unconditional and conditional branches at once
        latent_input = torch.cat([latents] * 2)
        latent_input = self.scheduler.scale_model_input(latent_input, timestep)
        with torch.no_grad():
            noise_pred = self.unet(latent_input, timestep,
                                   encoder_hidden_states=text_embedding).sample
        # Classifier-free guidance: push the prediction toward the text-conditioned branch
        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
        return noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

    def step_scheduler(self, noise_pred, timestep, latents):
        # Update latents based on the noise prediction
        return self.scheduler.step(noise_pred, timestep, latents).prev_sample

    def decode_latents(self, latents):
        # Decode latents to pixels with the VAE (undoing the 0.18215 latent scaling)
        with torch.no_grad():
            image = self.vae.decode(latents / 0.18215).sample
        image = (image / 2 + 0.5).clamp(0, 1)
        image = (image[0].permute(1, 2, 0).cpu().numpy() * 255).round().astype("uint8")
        return Image.fromarray(image)

    def generate_image(self, prompt, num_steps=50, guidance_scale=7.5):
        # Main image generation loop
        text_embedding = self.encode_text(prompt)
        self.scheduler.set_timesteps(num_steps)
        latents = self.sample_latent_noise(batch_size=1, latent_shape=(4, 64, 64))
        for timestep in self.scheduler.timesteps:
            noise_pred = self.denoise(latents, text_embedding, timestep, guidance_scale)
            latents = self.step_scheduler(noise_pred, timestep, latents)
        return self.decode_latents(latents)


# Example usage:
model = StableDiffusionModel(model_id="CompVis/stable-diffusion-v1-4")
image = model.generate_image("A sunset over a futuristic city")
image.save("generated_image.png")
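
# Optional: on a CUDA machine the denoising loop can run under torch autocast
# for a mixed-precision speedup. A minimal sketch under that assumption;
# autocast is skipped on CPU, where it brings no benefit for this workload.
from torch import autocast

if torch.cuda.is_available():
    with autocast("cuda"):
        image = model.generate_image("A sunset over a futuristic city")
else:
    image = model.generate_image("A sunset over a futuristic city")
image.save("generated_image.png")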