Commit 6144b88 · 1 parent: 9fd0a3f
Attempt to fix the clip guidance
app.py CHANGED
@@ -19,7 +19,6 @@ if not (path_exists(f"CLIP")):
     Repo.clone_from("https://github.com/openai/CLIP", "CLIP")
 sys.path.append('v-diffusion-pytorch')

-
 from huggingface_hub import hf_hub_download

 from CLIP import clip
@@ -62,18 +61,23 @@ model = model.half().cuda().eval().requires_grad_(False)
 #model_small.load_state_dict(torch.load(cc12m_model, map_location='cpu'))
 #model_small = model_small.half().cuda().eval().requires_grad_(False)

-print(model.clip_model)
 clip_model = clip.load(model.clip_model, jit=False, device='cuda')[0]
 clip_model.eval().requires_grad_(False)
 normalize = transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                                  std=[0.26862954, 0.26130258, 0.27577711])
 make_cutouts = MakeCutouts(clip_model.visual.input_resolution, 16, 1.)
+gc.collect()
+torch.cuda.empty_cache()
+
 def run_all(prompt, steps, n_images, weight, clip_guided):
+    gc.collect()
+    torch.cuda.empty_cache()
     import random
     seed = int(random.randint(0, 2147483647))
     target_embed = clip_model.encode_text(clip.tokenize(prompt).to('cuda')).float()#.cuda()

     if(clip_guided):
+        n_images = 1
         steps = steps*5
         clip_guidance_scale = weight*100
         prompts = [prompt]
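The objects set up in this hunk (clip_model, normalize, make_cutouts) only matter when the CLIP Guided box is ticked: the prompt is encoded once into target_embed, and during sampling the current image estimate is scored against it through random cutouts. A minimal sketch of that scoring step, assuming the MakeCutouts instance from app.py and images in the [-1, 1] range; the function name clip_similarity_loss is illustrative, not from the repo:

import torch
import torch.nn.functional as F

def clip_similarity_loss(image, target_embed, clip_model, make_cutouts, normalize):
    # Crop the image into several cutouts at CLIP's input resolution,
    # normalize with CLIP's ImageNet statistics, and embed them.
    cutouts = normalize(make_cutouts((image + 1) / 2))  # assumes image is in [-1, 1]
    image_embeds = clip_model.encode_image(cutouts).float()
    # Mean cosine distance between each cutout embedding and the text embedding.
    image_embeds = F.normalize(image_embeds, dim=-1)
    target = F.normalize(target_embed.float(), dim=-1)
    return (1 - (image_embeds * target).sum(-1)).mean()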
@@ -109,8 +113,7 @@ def run_all(prompt, steps, n_images, weight, clip_guided):
         clip_embed_in = torch.cat([torch.zeros_like(clip_embed_repeat), clip_embed_repeat])
         v_uncond, v_cond = model(x_in, t_in, clip_embed_in).chunk(2, dim=0)
         v = v_uncond + (v_cond - v_uncond) * weight
-        return v
-
+        return v
     def make_cond_model_fn(model, cond_fn):
         def cond_model_fn(x, t, **extra_args):
             with torch.enable_grad():
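The line v = v_uncond + (v_cond - v_uncond) * weight is classifier-free guidance: the model is run on an unconditional copy of the batch (zero CLIP embedding) and a conditional copy, and the two predictions are extrapolated by the user-chosen weight. A stripped-down restatement of that wrapper, with illustrative names, assuming a cc12m_1-style model that takes (x, t, clip_embed):

import torch

def cfg_model_fn(model, x, t, target_embed, weight):
    n = x.shape[0]
    # First half of the doubled batch is unconditional (zero embedding),
    # second half is conditioned on the prompt embedding.
    x_in = x.repeat([2, 1, 1, 1])
    t_in = t.repeat([2])
    clip_embed = target_embed.repeat([n, 1])
    clip_embed_in = torch.cat([torch.zeros_like(clip_embed), clip_embed])
    v_uncond, v_cond = model(x_in, t_in, clip_embed_in).chunk(2, dim=0)
    # Push the prediction away from the unconditional one by `weight`.
    return v_uncond + (v_cond - v_uncond) * weight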
@@ -132,12 +135,9 @@ def run_all(prompt, steps, n_images, weight, clip_guided):
         grad = -torch.autograd.grad(loss, x)[0]
         return grad

-    gc.collect()
-    torch.cuda.empty_cache()
     torch.manual_seed(seed)
     x = torch.randn([n_images, 3, side_y, side_x], device='cuda')
     t = torch.linspace(1, 0, steps + 1, device='cuda')[:-1]
-    #step_list = utils.get_spliced_ddpm_cosine_schedule(t)
     if model.min_t == 0:
         step_list = utils.get_spliced_ddpm_cosine_schedule(t)
     else:
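grad = -torch.autograd.grad(loss, x)[0] is the core of CLIP guidance: the sampler nudges the noisy image x along the negative gradient of a CLIP loss computed on the model's denoised estimate, scaled by clip_guidance_scale. A minimal sketch of such a cond_fn, assuming the spherical distance loss used in Katherine Crowson's CLIP-guided sampling scripts; clip_cond_fn and the [-1, 1] to [0, 1] rescale are assumptions, not code from this commit:

import torch
import torch.nn.functional as F

def spherical_dist_loss(x, y):
    # Distance between unit-normalized embeddings on the sphere.
    x = F.normalize(x, dim=-1)
    y = F.normalize(y, dim=-1)
    return (x - y).norm(dim=-1).div(2).arcsin().pow(2).mul(2)

def clip_cond_fn(x, t, denoised_in, *, clip_model, make_cutouts, normalize,
                 target_embed, clip_guidance_scale):
    # denoised_in is the model's current estimate of the clean image; it must be
    # connected to x in the autograd graph (the make_cond_model_fn wrapper and
    # torch.enable_grad() in app.py take care of that).
    cutouts = normalize(make_cutouts((denoised_in + 1) / 2))
    image_embeds = clip_model.encode_image(cutouts).float()
    losses = spherical_dist_loss(image_embeds, target_embed.float())
    loss = losses.mean() * clip_guidance_scale
    # Gradient of the loss w.r.t. the noisy image; the sampler steps along -grad.
    return -torch.autograd.grad(loss, x)[0]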
@@ -164,11 +164,10 @@ iface = gr.Interface(
         gr.inputs.Slider(label="Steps - more steps can increase quality but will take longer to generate",default=40,maximum=80,minimum=1,step=1),
         gr.inputs.Slider(label="Number of images in parallel", default=2, maximum=4, minimum=1, step=1),
         gr.inputs.Slider(label="Weight - how closely the image should resemble the prompt", default=5, maximum=15, minimum=0, step=1),
-        gr.inputs.Checkbox(label="CLIP Guided - improves coherence with complex prompts, makes it slower"),
+        gr.inputs.Checkbox(label="CLIP Guided - improves coherence with complex prompts, makes it slower (with CLIP Guidance only one image is generated)"),
     ],
     outputs=gallery,
     title="Generate images from text with V-Diffusion",
     description="<div>By typing a prompt and pressing submit you can generate images based on this prompt. <a href='https://github.com/crowsonkb/v-diffusion-pytorch' target='_blank'>V-Diffusion</a> is diffusion text-to-image model created by <a href='https://twitter.com/RiversHaveWings' target='_blank'>Katherine Crowson</a> and <a href='https://twitter.com/jd_pressman'>JDP</a>, trained on the <a href='https://github.com/google-research-datasets/conceptual-12m'>CC12M dataset</a>. The UI to the model was assembled by <a style='color: rgb(99, 102, 241);font-weight:bold' href='https://twitter.com/multimodalart' target='_blank'>@multimodalart</a>, keep up with the <a style='color: rgb(99, 102, 241);' href='https://multimodal.art/news' target='_blank'>latest multimodal ai art news here</a> and consider <a style='color: rgb(99, 102, 241);' href='https://www.patreon.com/multimodalart' target='_blank'>supporting us on Patreon</a></div>",
-    #article="<h4 style='font-size: 110%;margin-top:.5em'>Biases acknowledgment</h4><div>Despite how impressive being able to turn text into image is, beware to the fact that this model may output content that reinforces or exarcbates societal biases. According to the <a href='https://arxiv.org/abs/2112.10752' target='_blank'>Latent Diffusion paper</a>:<i> \"Deep learning modules tend to reproduce or exacerbate biases that are already present in the data\"</i>. The model was trained on an unfiltered version the LAION-400M dataset, which scrapped non-curated image-text-pairs from the internet (the exception being the the removal of illegal content) and is meant to be used for research purposes, such as this one. <a href='https://laion.ai/laion-400-open-dataset/' target='_blank'>You can read more on LAION's website</a></div><h4 style='font-size: 110%;margin-top:1em'>Who owns the images produced by this demo?</h4><div>Definetly not me! Probably you do. I say probably because the Copyright discussion about AI generated art is ongoing. So <a href='https://www.theverge.com/2022/2/21/22944335/us-copyright-office-reject-ai-generated-art-recent-entrance-to-paradise' target='_blank'>it may be the case that everything produced here falls automatically into the public domain</a>. But in any case it is either yours or is in the public domain.</div>"
     )
 iface.launch(enable_queue=True)
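For context, the checkbox value is passed into run_all as the clip_guided flag, which is what multiplies the step count and, after this commit, forces a single image. A reduced sketch of the same Gradio wiring with a stub in place of the sampler; the Textbox and the text output are illustrative, not the app's exact layout:

import gradio as gr

def run_all_stub(prompt, steps, n_images, weight, clip_guided):
    # Stand-in for the real sampler: just report what would be generated.
    n = 1 if clip_guided else n_images
    return f"{prompt!r}: {n} image(s), {steps * 5 if clip_guided else steps} steps, weight {weight}"

iface = gr.Interface(
    fn=run_all_stub,
    inputs=[
        gr.inputs.Textbox(label="Prompt"),
        gr.inputs.Slider(label="Steps", default=40, maximum=80, minimum=1, step=1),
        gr.inputs.Slider(label="Number of images in parallel", default=2, maximum=4, minimum=1, step=1),
        gr.inputs.Slider(label="Weight", default=5, maximum=15, minimum=0, step=1),
        gr.inputs.Checkbox(label="CLIP Guided"),
    ],
    outputs="text",
)
iface.launch(enable_queue=True)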