import torch
import gradio as gr
from transformers import Owlv2Processor, Owlv2ForObjectDetection
import os
import torchvision
import shutil

# --- Setup ---
# Clean caches on each restart (helps stay under the Space's 50 GB storage limit)
for cache_dir in [
    os.path.expanduser("~/.cache/huggingface"),
    os.path.expanduser("~/.cache/torch"),
]:
    shutil.rmtree(cache_dir, ignore_errors=True)

# Force Hugging Face cache to /tmp (ephemeral)
os.environ["HF_HUB_CACHE"] = "/tmp/hf_cache"
os.makedirs(os.environ["HF_HUB_CACHE"], exist_ok=True)

# Route Gradio's temp files to the ephemeral disk as well
os.environ["GRADIO_TEMP_DIR"] = "/tmp/gradio"
os.makedirs(os.environ["GRADIO_TEMP_DIR"], exist_ok=True)

# Handle ZeroGPU safely for local debugging
try:
    import spaces
except ImportError:
    class spaces:
        @staticmethod
        def GPU(*args, **kwargs):
            def decorator(fn):
                return fn
            return decorator

device = "cuda" if torch.cuda.is_available() else "cpu"

# --- Lazy Model Loader ---
MODELS = {}

def get_model(selected_model):
    """Load model + processor on demand and cache in memory."""
    if selected_model in MODELS:
        return MODELS[selected_model]

    print(f"Loading {selected_model}...")

    if selected_model == "NoctOWLv2-Base":
        model = Owlv2ForObjectDetection.from_pretrained(
            "lorebianchi98/NoctOWLv2-base-patch16"
        ).to(device)
        processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16")

    elif selected_model == "NoctOWLv2-Large":
        model = Owlv2ForObjectDetection.from_pretrained(
            "lorebianchi98/NoctOWLv2-large-patch14"
        ).to(device)
        processor = Owlv2Processor.from_pretrained("google/owlv2-large-patch14")

    else:
        raise gr.Error(f"Unknown model: {selected_model}")

    # Cache in memory so re-selections don't re-load from disk
    MODELS[selected_model] = (model, processor)
    return model, processor
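
# Optional warm-up: calling get_model() once at import time would pre-download
# the weights so the first request is faster (left commented out here, since
# lazy loading keeps startup quick):
#   get_model("NoctOWLv2-Base")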


# --- Inference Function ---
@spaces.GPU(duration=120)
def query_image(img, text_queries, score_threshold, selected_model):
    if img is None:
        raise gr.Error("Please upload or select an example image first.")
    if not text_queries.strip():
        raise gr.Error("Please enter at least one text query.")
    if not selected_model:
        raise gr.Error("Please select a model before running inference.")

    model, processor = get_model(selected_model)
    # Ensure the model is on the active device (under ZeroGPU the GPU is only
    # attached while this decorated function runs)
    model = model.to(device)

    # Prepare text
    text_queries = [f"a {t.strip()}" for t in text_queries.split(",") if t.strip()]
    if not text_queries:
        raise gr.Error("No valid queries found. Please check your input text.")

    # Preprocess. OWLv2 pads the input to a square internally, so boxes must be
    # rescaled to a (max_dim, max_dim) canvas to line up with the original image.
    size = max(img.shape[:2])
    target_sizes = torch.Tensor([[size, size]])
    inputs = processor(text=text_queries, images=img, return_tensors="pt").to(device)

    # Inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Post-process on CPU so the outputs match the CPU-resident target_sizes
    outputs.logits = outputs.logits.cpu()
    outputs.pred_boxes = outputs.pred_boxes.cpu()
    results = processor.post_process_object_detection(
        outputs=outputs, target_sizes=target_sizes, threshold=score_threshold
    )

    boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]

    # Non-Maximum Suppression
    keep = torchvision.ops.nms(boxes, scores, iou_threshold=0.5)
    boxes, scores, labels = boxes[keep], scores[keep], labels[keep]
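    # Note: torchvision.ops.nms is class-agnostic, so overlapping detections for
    # different queries suppress each other; torchvision.ops.batched_nms(boxes,
    # scores, labels, 0.5) would instead suppress per query.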

    # Format output
    result_labels = []
    for box, score, label in zip(boxes, scores, labels):
        if score < score_threshold:
            continue
        box = [int(i) for i in box.tolist()]
        result_labels.append((box, f"{text_queries[label.item()]} ({score:.2f})"))

    return img, result_labels
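
# Quick local check (hypothetical, commented out), assuming the example assets
# exist and Pillow/numpy are installed:
#   import numpy as np
#   from PIL import Image
#   img = np.asarray(Image.open("assets/pool.jpg"))
#   annotated, labels = query_image(img, "white ball, blue ball", 0.1,
#                                   "NoctOWLv2-Base")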


# --- Interface Description ---
description = """
# 🦉 **NoctOWLv2: Fine-Grained Open-Vocabulary Object Detection**

**NoctOWL** (***N***ot **o**nly **c**oarse-**t**ext **OWL**) extends **OWL-ViT** and **OWLv2** for **Fine-Grained Open-Vocabulary Detection (FG-OVD)**.  
It can recognize subtle object differences such as **color, texture, and material**, while retaining strong coarse-grained detection abilities.

**Available Models:**
- 🧩 **NoctOWLv2-Base**: smaller and faster.
- 🧠 **NoctOWLv2-Large**: more accurate, higher capacity.

📘 [Training & evaluation code](https://github.com/lorebianchi98/FG-OVD/NoctOWL)
"""

# --- Create Interface Layout ---
with gr.Blocks(title="NoctOWLv2: Fine-Grained Zero-Shot Object Detection") as demo:
    gr.Markdown(description)

    with gr.Row():
        with gr.Column():
            input_image = gr.Image(label="Input Image")

            text_queries = gr.Textbox(
                label="Text Queries (comma-separated)",
                placeholder="e.g., red shoes, striped shirt, yellow ball"
            )

            score_threshold = gr.Slider(
                0, 1, value=0.1, step=0.01, label="Score Threshold"
            )

            model_dropdown = gr.Dropdown(
                choices=["NoctOWLv2-Base", "NoctOWLv2-Large"],
                label="Select Model",
                value=None,
                info="Select which model to use for detection",
            )

            run_button = gr.Button("🚀 Run Detection", interactive=False)

        with gr.Column():
            output_image = gr.AnnotatedImage(label="Detected Objects")

    # --- Enable / Disable Run Button ---
    def toggle_button(model, text):
        return gr.update(interactive=bool(model and text.strip()))

    model_dropdown.change(
        fn=toggle_button,
        inputs=[model_dropdown, text_queries],
        outputs=run_button,
    )

    text_queries.change(
        fn=toggle_button,
        inputs=[model_dropdown, text_queries],
        outputs=run_button,
    )

    # --- Connect Button to Inference ---
    run_button.click(
        fn=query_image,
        inputs=[input_image, text_queries, score_threshold, model_dropdown],
        outputs=output_image,
    )

    # --- Example Images ---
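    # Note: examples populate only the image, queries, and threshold; a model
    # must still be selected before the Run button becomes active.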
    gr.Examples(
        examples=[
            ["assets/desciglio.jpg", "striped football shirt, plain red football shirt, yellow shoes, red shoes", 0.07],
            ["assets/pool.jpg", "white ball, blue ball, black ball, yellow ball", 0.1],
            ["assets/patio.jpg", "ceramic mug, glass mug, pink flowers, blue flowers", 0.09],
        ],
        inputs=[input_image, text_queries, score_threshold],
    )

demo.launch()