Spaces:

taesiri
/

CLIPScore

Running on Zero

App Files Files Community

taesiri committed on Sep 2, 2024

Commit

61b7eee

verified ·

1 Parent(s): 73f9f45

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -15

app.py CHANGED Viewed

@@ -3,45 +3,81 @@ import gradio as gr
 from transformers import CLIPProcessor, CLIPModel
 import spaces
-model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").to("cuda")
-processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
 @spaces.GPU
-def calculate_score(image, text):
     labels = text.split(";")
     labels = [l.strip() for l in labels]
     labels = list(filter(None, labels))
     if len(labels) == 0:
         return dict()
     inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
     inputs = {k: v.to("cuda") for k, v in inputs.items()}
-    outputs = model(**inputs)
-    logits_per_image = outputs.logits_per_image.detach().cpu().numpy()
     results_dict = {label: score / 100.0 for label, score in zip(labels, logits_per_image[0])}
     return results_dict
 with gr.Blocks() as demo:
-    gr.Markdown("# CLIP Score")
-    gr.Markdown("Calculate the [CLIP](https://openai.com/blog/clip/) score of a given image and text")
     with gr.Row():
-        image_input = gr.Image()
         output_label = gr.Label()
-    text_input = gr.Textbox(label="Descriptions (separated by semicolons)")
     image_input.change(
-        fn=calculate_score,
-        inputs=[image_input, text_input],
         outputs=output_label
     )
     text_input.submit(
-        fn=calculate_score,
-        inputs=[image_input, text_input],
         outputs=output_label
     )
@@ -50,10 +86,11 @@ with gr.Blocks() as demo:
             [
                 "cat.jpg",
                 "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void",
             ]
         ],
-        fn=calculate_score,
-        inputs=[image_input, text_input],
         outputs=output_label,
     )

 from transformers import CLIPProcessor, CLIPModel
 import spaces
+# Dictionary of available CLIP models with their image sizes
+CLIP_MODELS = {
+    "ViT-B/32": ("openai/clip-vit-base-patch32", 224),
+    "ViT-B/16": ("openai/clip-vit-base-patch16", 224),
+    "ViT-L/14": ("openai/clip-vit-large-patch14", 224),
+    "ViT-L/14@336px": ("openai/clip-vit-large-patch14-336", 336),
+}
+# Initialize models and processors
+models = {}
+processors = {}
+for model_name, (model_path, _) in CLIP_MODELS.items():
+    models[model_name] = CLIPModel.from_pretrained(model_path).to("cuda")
+    processors[model_name] = CLIPProcessor.from_pretrained(model_path)
 @spaces.GPU
+def calculate_score(image, text, model_name):
     labels = text.split(";")
     labels = [l.strip() for l in labels]
     labels = list(filter(None, labels))
     if len(labels) == 0:
         return dict()
+    model = models[model_name]
+    processor = processors[model_name]
+    # Get the correct image size for the model
+    _, image_size = CLIP_MODELS[model_name]
+    # Preprocess the image and text
     inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
     inputs = {k: v.to("cuda") for k, v in inputs.items()}
+    # Calculate scores
+    with torch.no_grad():
+        outputs = model(**inputs)
+    logits_per_image = outputs.logits_per_image.cpu().numpy()
     results_dict = {label: score / 100.0 for label, score in zip(labels, logits_per_image[0])}
     return results_dict
 with gr.Blocks() as demo:
+    gr.Markdown("# Multi-Model CLIP Score")
+    gr.Markdown("Calculate the [CLIP](https://openai.com/blog/clip/) score of a given image and text using different CLIP model variants")
     with gr.Row():
+        image_input = gr.Image(type="pil")
         output_label = gr.Label()
+    with gr.Row():
+        text_input = gr.Textbox(label="Descriptions (separated by semicolons)")
+        model_dropdown = gr.Dropdown(choices=list(CLIP_MODELS.keys()), label="CLIP Model", value="ViT-B/16")
+    def process_inputs(image, text, model_name):
+        if image is None or text.strip() == "":
+            return None
+        return calculate_score(image, text, model_name)
     image_input.change(
+        fn=process_inputs,
+        inputs=[image_input, text_input, model_dropdown],
         outputs=output_label
     )
     text_input.submit(
+        fn=process_inputs,
+        inputs=[image_input, text_input, model_dropdown],
+        outputs=output_label
+    )
+    model_dropdown.change(
+        fn=process_inputs,
+        inputs=[image_input, text_input, model_dropdown],
         outputs=output_label
     )
             [
                 "cat.jpg",
                 "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void",
+                "ViT-B/16"
             ]
         ],
+        fn=process_inputs,
+        inputs=[image_input, text_input, model_dropdown],
         outputs=output_label,
     )