# diffuser.py
import torch
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
from PIL import Image

# Use a GPU if one is available; Stable Diffusion falls back to (very slow) CPU otherwise
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the Stable Diffusion model from Hugging Face
model_id = "CompVis/stable-diffusion-v1-4"
pipe = StableDiffusionPipeline.from_pretrained(model_id)
pipe.to(device)

# Swap in the DPM-Solver++ multistep scheduler (optional, but it reaches good
# quality in fewer steps than the default scheduler)
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
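
# Optional memory tweak (a hedged addition, not part of the original script):
# attention slicing trades a little speed for a noticeably smaller peak memory
# footprint, which helps on GPUs with limited VRAM.
if device == "cuda":
    pipe.enable_attention_slicing()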

# Function to generate an image
def generate_image(prompt, num_inference_steps=50, guidance_scale=7.5, height=512, width=512):
    """
    Generate an image from a text prompt using the Stable Diffusion model.

    Args:
        prompt (str): The input text for generating an image.
        num_inference_steps (int): The number of diffusion steps (higher is slower but better quality).
        guidance_scale (float): Controls how closely the model follows the text prompt.
        height (int): The height of the generated image in pixels.
        width (int): The width of the generated image in pixels.

    Returns:
        image (PIL Image): The generated image.
    """
    image = pipe(
        prompt,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        height=height,
        width=width,
    ).images[0]
    return image

# Example usage
prompt = "A futuristic city skyline at sunset"
image = generate_image(prompt)

# Save or show the image
image.save("generated_image.png")
image.show()
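
# Hedged usage sketch (not in the original script): the pipeline also accepts a
# torch.Generator, so seeding one makes a run reproducible. The seed 42 and the
# output filename are arbitrary choices for illustration.
seeded_image = pipe(
    prompt,
    num_inference_steps=25,
    guidance_scale=7.5,
    generator=torch.Generator(device=device).manual_seed(42),
).images[0]
seeded_image.save("generated_image_seeded.png")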

from transformers import CLIPTextModel, CLIPTokenizer

# Load the pre-trained CLIP text encoder and tokenizer. Stable Diffusion v1.x
# uses CLIP ViT-L/14, whose 768-dim token embeddings match the UNet's
# cross-attention layers (the 512-dim clip-vit-base-patch32 would not).
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").to(device)

# Tokenize the input prompt, padded to CLIP's fixed context length
prompt = "A scenic view of mountains during sunset"
inputs = tokenizer(
    prompt, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pt"
).to(device)

# Generate text embeddings to condition the UNet
with torch.no_grad():
    text_embeddings = text_encoder(inputs.input_ids).last_hidden_state
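
# Illustrative sanity check (an addition, not in the original script): CLIP
# ViT-L/14 yields one 768-dim vector per token position, padded to 77 tokens.
# A full Stable Diffusion loop would also encode an empty prompt here and use
# both embeddings for classifier-free guidance; that step is omitted below.
assert text_embeddings.shape == (1, tokenizer.model_max_length, 768)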

# Sample random noise as the starting point for diffusion
# (a 1x4x64x64 latent decodes to a 512x512 image)
pipe.scheduler.set_timesteps(50)
latent = torch.randn((1, 4, 64, 64), device=device) * pipe.scheduler.init_noise_sigma

# Iterate through the scheduler's timesteps to denoise the latent, reusing the
# UNet and scheduler from the pipeline loaded above
for t in pipe.scheduler.timesteps:
    latent_input = pipe.scheduler.scale_model_input(latent, t)
    with torch.no_grad():
        noise_pred = pipe.unet(latent_input, t, encoder_hidden_states=text_embeddings).sample
    latent = pipe.scheduler.step(noise_pred, t, latent).prev_sample

# Decode the latent representation back to an image, undoing the VAE's latent scaling
with torch.no_grad():
    decoded_image = pipe.vae.decode(latent / pipe.vae.config.scaling_factor).sample
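
# Hedged post-processing sketch (not in the original script): the VAE decodes to
# a float tensor in roughly [-1, 1], so rescale to 8-bit RGB before viewing.
# The output filename is arbitrary.
arr = (decoded_image / 2 + 0.5).clamp(0, 1)
arr = (arr[0].permute(1, 2, 0).cpu().numpy() * 255).round().astype("uint8")
Image.fromarray(arr).save("manual_diffusion_image.png")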