Spaces:

yifehuang97
/

CountEx

Sleeping

App Files Files Community

yifehuang97 committed 1 day ago

Commit

3bef090

1 Parent(s): 97e2fd4

(feat) llm parse

Browse files

Files changed (1) hide show

app.py +325 -157

app.py CHANGED Viewed

@@ -1,16 +1,86 @@
 import os
 import gradio as gr
 import torch
 from PIL import Image, ImageDraw
 from transformers import GroundingDinoProcessor
 from hf_model import CountEX
 from utils import post_process_grounded_object_detection, post_process_grounded_object_detection_with_queries
 # Global variables for model and processor
 model = None
 processor = None
 device = None
 def load_model():
     """Load model and processor once at startup"""
@@ -34,49 +104,6 @@ def load_model():
 import numpy as np
-def filter_points_by_negative(points, neg_points, image_size, pixel_threshold=5):
-    """
-    Filter out positive points that are too close to any negative point.
-    Args:
-        points: List of [x, y] positive points (normalized coordinates, 0-1)
-        neg_points: List of [x, y] negative points (normalized coordinates, 0-1)
-        image_size: Tuple of (width, height) in pixels
-        pixel_threshold: Minimum distance threshold in pixels
-    Returns:
-        filtered_points: List of points that are far enough from all negative points
-        filtered_indices: Indices of the kept points in the original list
-    """
-    if not neg_points or not points:
-        return points, list(range(len(points)))
-    width, height = image_size
-    points_arr = np.array(points)  # (N, 2) normalized
-    neg_points_arr = np.array(neg_points)  # (M, 2) normalized
-    # Convert to pixel coordinates
-    points_pixel = points_arr * np.array([width, height])  # (N, 2)
-    neg_points_pixel = neg_points_arr * np.array([width, height])  # (M, 2)
-    # Compute pairwise distances in pixels: (N, M)
-    diff = points_pixel[:, None, :] - neg_points_pixel[None, :, :]
-    distances = np.linalg.norm(diff, axis=-1)  # (N, M)
-    # Find minimum distance to any negative point for each positive point
-    min_distances = distances.min(axis=1)  # (N,)
-    # Keep points where min distance > threshold
-    keep_mask = min_distances > pixel_threshold
-    filtered_points = points_arr[keep_mask].tolist()
-    filtered_indices = np.where(keep_mask)[0].tolist()
-    return filtered_points, filtered_indices
-import numpy as np
 def discriminative_point_suppression(
     points,
@@ -166,35 +193,166 @@ def discriminative_point_suppression(
     return filtered_points, filtered_indices, suppression_info
-def count_objects(image, pos_caption, neg_caption, box_threshold, point_radius, point_color):
     """
     Main inference function for counting objects
     Args:
         image: Input PIL Image
-        pos_caption: Positive prompt (objects to count)
-        neg_caption: Negative prompt (objects to exclude)
         box_threshold: Detection confidence threshold
         point_radius: Radius of visualization points
         point_color: Color of visualization points
     Returns:
-        Annotated image and count
     """
     global model, processor, device
     if model is None:
         load_model()
     # Ensure image is RGB
     if image.mode != "RGB":
         image = image.convert("RGB")
     # Ensure captions end with period
-    if not pos_caption.endswith('.'):
         pos_caption = pos_caption + '.'
     if neg_caption and not neg_caption.endswith('.'):
         neg_caption = neg_caption + '.'
     # Process positive caption
     pos_inputs = processor(
@@ -206,12 +364,10 @@ def count_objects(image, pos_caption, neg_caption, box_threshold, point_radius,
     pos_inputs = pos_inputs.to(device)
     pos_inputs['pixel_values'] = pos_inputs['pixel_values'].to(torch.bfloat16)
-    # Process negative caption if provided
-    use_neg = bool(neg_caption and neg_caption.strip() and neg_caption != '.')
     if not use_neg:
-        # print('neg_caption: ', neg_caption)
         neg_caption = "None."
     neg_inputs = processor(
         images=image,
@@ -229,31 +385,12 @@ def count_objects(image, pos_caption, neg_caption, box_threshold, point_radius,
     pos_inputs['neg_pixel_values'] = neg_inputs['pixel_values']
     pos_inputs['neg_input_ids'] = neg_inputs['input_ids']
     pos_inputs['use_neg'] = True
-    # else:
-    #     neg_caption = "None."
-    #     neg_inputs = processor(
-    #         images=image,
-    #         text=neg_caption,
-    #         return_tensors="pt",
-    #         padding=True
-    #     )
-    #     neg_inputs = {k: v.to(device) for k, v in neg_inputs.items()}
-    #     neg_inputs['pixel_values'] = neg_inputs['pixel_values'].to(torch.bfloat16)
-    #     # Add negative inputs to positive inputs dict
-    #     pos_inputs['neg_token_type_ids'] = neg_inputs['token_type_ids']
-    #     pos_inputs['neg_attention_mask'] = neg_inputs['attention_mask']
-    #     pos_inputs['neg_pixel_mask'] = neg_inputs['pixel_mask']
-    #     pos_inputs['neg_pixel_values'] = neg_inputs['pixel_values']
-    #     pos_inputs['neg_input_ids'] = neg_inputs['input_ids']
-    #     pos_inputs['use_neg'] = False
     # Run inference
     with torch.no_grad():
         outputs = model(**pos_inputs)
     # Post-process outputs
-    # positive prediction
     outputs["pred_points"] = outputs["pred_boxes"][:, :, :2]
     outputs["pred_logits"] = outputs["logits"]
@@ -270,7 +407,9 @@ def count_objects(image, pos_caption, neg_caption, box_threshold, point_radius,
     boxes = [box.tolist() for box in boxes]
     points = [[box[0], box[1]] for box in boxes]
-    # negative prediction
     if "neg_pred_boxes" in outputs and "neg_logits" in outputs:
         neg_outputs = outputs.copy()
         neg_outputs["pred_boxes"] = outputs["neg_pred_boxes"]
@@ -283,31 +422,25 @@ def count_objects(image, pos_caption, neg_caption, box_threshold, point_radius,
         neg_boxes = [box.tolist() for box in neg_boxes]
         neg_points = [[box[0], box[1]] for box in neg_boxes]
-    pos_queries = results["queries"]
-    neg_queries = neg_results["queries"]
-    pos_queries = pos_queries.cpu().numpy()
-    neg_queries = neg_queries.cpu().numpy()
     img_size = image.size
-    # filtered_points, kept_indices = filter_points_by_negative(
-    #     points,
-    #     neg_points,
-    #     image_size=img_size,
-    #     pixel_threshold=5
-    # )
-    filtered_points, kept_indices, suppression_info = discriminative_point_suppression(
-        points,
-        neg_points,
-        pos_queries,
-        neg_queries,
-        image_size=img_size,
-        pixel_threshold=5,
-        similarity_threshold=0.3,
-    )
-    filtered_boxes = [boxes[i] for i in kept_indices]
-    if "scores" in results:
-        filtered_scores = [results["scores"][i].item() for i in kept_indices]
     points = filtered_points
     boxes = filtered_boxes
@@ -324,18 +457,10 @@ def count_objects(image, pos_caption, neg_caption, box_threshold, point_radius,
             [x - point_radius, y - point_radius, x + point_radius, y + point_radius],
             fill=point_color
         )
-    # for point in neg_points:
-    #     x = point[0] * img_w
-    #     y = point[1] * img_h
-    #     draw.ellipse(
-    #         [x - point_radius, y - point_radius, x + point_radius, y + point_radius],
-    #         fill="red"
-    #     )
     count = len(points)
-    return img_draw, f"Count: {count}"
 # Create Gradio interface
@@ -343,76 +468,119 @@ def create_demo():
     with gr.Blocks(title="CountEx: Discriminative Visual Counting") as demo:
         gr.Markdown("""
         # CountEx: Fine-Grained Counting via Exemplars and Exclusion
-        Count specific objects in images using positive and negative text prompts.
         """)
         with gr.Row():
             with gr.Column(scale=1):
                 input_image = gr.Image(type="pil", label="Input Image")
-                pos_caption = gr.Textbox(
-                    label="Positive Prompt",
-                    placeholder="e.g., Green Apple",
-                    value="Pos Caption Here."
-                )
-                neg_caption = gr.Textbox(
-                    label="Negative Prompt (optional)",
-                    placeholder="e.g., Red Apple",
-                    value="None."
-                )
-                box_threshold = gr.Slider(
-                    minimum=0.0,
-                    maximum=1.0,
-                    value=0.42,
-                    step=0.01,
-                    label="Detection Threshold (0.42 = use model default)"
-                )
-                point_radius = gr.Slider(
-                    minimum=1,
-                    maximum=20,
-                    value=5,
-                    step=1,
-                    label="Point Radius"
-                )
-                point_color = gr.Dropdown(
-                    choices=["blue", "red", "green", "yellow", "cyan", "magenta", "white"],
-                    value="blue",
-                    label="Point Color"
-                )
-                submit_btn = gr.Button("Count Objects", variant="primary")
             with gr.Column(scale=1):
                 output_image = gr.Image(type="pil", label="Result")
                 count_output = gr.Textbox(label="Count Result")
-        # Example images
-        # ["examples/in_the_wild.jpg", "Green plastic cup.", "Blue plastic cup."],
         gr.Examples(
             examples=[
-                ["examples/apples.png", "apple.", "Green apple."],
-                ["examples/apple.jpg", "apple.", "red apple."],
-                ["examples/black_beans.jpg", "Black bean.", "Soy bean."],
-                ["examples/candy.jpg", "Brown coffee candy.", "Black coffee candy."],
-                ["examples/strawberry.jpg", "strawberry and blueberry.", "strawberry."],
-                ["examples/strawberry2.jpg", "strawberry and blueberry.", "strawberry."],
-                ["examples/women.jpg", "person.", "woman."],
-                ["examples/boat-1.jpg", "boat.", "blue boat."],
             ],
-            inputs=[input_image, pos_caption, neg_caption],
-            outputs=[output_image, count_output],
             fn=count_objects,
             cache_examples=False,
         )
         submit_btn.click(
-            fn=count_objects,
-            inputs=[input_image, pos_caption, neg_caption, box_threshold, point_radius, point_color],
-            outputs=[output_image, count_output]
         )
     return demo

 import os
+import json
 import gradio as gr
 import torch
 from PIL import Image, ImageDraw
 from transformers import GroundingDinoProcessor
 from hf_model import CountEX
 from utils import post_process_grounded_object_detection, post_process_grounded_object_detection_with_queries
+import google.generativeai as genai
 # Global variables for model and processor
 model = None
 processor = None
 device = None
+# Configure Gemini
+genai.configure(api_key='AIzaSyAoQcUhn_KwOWvjdVqJ1kEaT0zBcnAKppo')
+gemini_model = genai.GenerativeModel("gemini-2.0-flash")
+PARSING_PROMPT = """Parse the user's counting instruction into two lists:
+- A (include): objects to count
+- B (exclude): objects to exclude from counting
+Rules:
+1. Split on "and", "or", and commas
+2. Reattach shared head nouns (e.g., "red and black beans" → "red beans", "black beans")
+3. Remove from B items that are equivalent to A (synonyms/variants/abbreviations)
+4. Remove from B items that are more specific than A
+5. If B is more general than A but shares head noun, rewrite B to specific non-overlapping forms
+Examples:
+- "Count green apples, not red apples" → A: ["green apples"], B: ["red apples"]
+- "Count apples, not green apples" → A: ["apples"], B: []
+- "Count green apples, not apples" → A: ["green apples"], B: ["non-green apples"]
+- "Count fries, not chips" → A: ["fries"], B: []
+- "Count black beans, not poker chips" → A: ["black beans"], B: ["poker chips"]
+User instruction: {instruction}
+Respond ONLY with a JSON object in this exact format, no other text:
+{{"A": ["item1", "item2"], "B": ["item3"]}}
+"""
+def parse_counting_instruction(instruction: str) -> tuple[str, str]:
+    """
+    Parse natural language counting instruction using Gemini 2.0 Flash.
+    Args:
+        instruction: Natural language instruction like "count apples, not green apples"
+    Returns:
+        tuple: (positive_caption, negative_caption)
+    """
+    try:
+        prompt = PARSING_PROMPT.format(instruction=instruction)
+        response = gemini_model.generate_content(prompt)
+        response_text = response.text.strip()
+        # Clean up response - remove markdown code blocks if present
+        if response_text.startswith("```"):
+            response_text = response_text.split("```")[1]
+            if response_text.startswith("json"):
+                response_text = response_text[4:]
+        response_text = response_text.strip()
+        result = json.loads(response_text)
+        # Convert lists to caption strings
+        pos_items = result.get("A", [])
+        neg_items = result.get("B", [])
+        # Join items with " and " and add period
+        pos_caption = " and ".join(pos_items) + "." if pos_items else ""
+        neg_caption = " and ".join(neg_items) + "." if neg_items else "None."
+        return pos_caption, neg_caption
+    except Exception as e:
+        print(f"Error parsing instruction: {e}")
+        # Fallback: treat entire instruction as positive caption
+        return instruction.strip() + ".", "None."
 def load_model():
     """Load model and processor once at startup"""
 import numpy as np
 def discriminative_point_suppression(
     points,
     return filtered_points, filtered_indices, suppression_info
+def count_objects(image, instruction, box_threshold, point_radius, point_color):
     """
     Main inference function for counting objects
     Args:
         image: Input PIL Image
+        instruction: Natural language instruction (e.g., "count apples, not green apples")
         box_threshold: Detection confidence threshold
         point_radius: Radius of visualization points
         point_color: Color of visualization points
     Returns:
+        Annotated image, count, and parsed captions
     """
     global model, processor, device
     if model is None:
         load_model()
+    # Parse instruction using Gemini
+    pos_caption, neg_caption = parse_counting_instruction(instruction)
+    parsed_info = f"Positive: {pos_caption}\nNegative: {neg_caption}"
     # Ensure image is RGB
     if image.mode != "RGB":
         image = image.convert("RGB")
+    # Process positive caption
+    pos_inputs = processor(
+        images=image,
+        text=pos_caption,
+        return_tensors="pt",
+        padding=True
+    )
+    pos_inputs = pos_inputs.to(device)
+    pos_inputs['pixel_values'] = pos_inputs['pixel_values'].to(torch.bfloat16)
+    # Process negative caption
+    use_neg = bool(neg_caption and neg_caption.strip() and neg_caption != '.' and neg_caption != 'None.')
+    if not use_neg:
+        neg_caption = "None."
+    neg_inputs = processor(
+        images=image,
+        text=neg_caption,
+        return_tensors="pt",
+        padding=True
+    )
+    neg_inputs = {k: v.to(device) for k, v in neg_inputs.items()}
+    neg_inputs['pixel_values'] = neg_inputs['pixel_values'].to(torch.bfloat16)
+    # Add negative inputs to positive inputs dict
+    pos_inputs['neg_token_type_ids'] = neg_inputs['token_type_ids']
+    pos_inputs['neg_attention_mask'] = neg_inputs['attention_mask']
+    pos_inputs['neg_pixel_mask'] = neg_inputs['pixel_mask']
+    pos_inputs['neg_pixel_values'] = neg_inputs['pixel_values']
+    pos_inputs['neg_input_ids'] = neg_inputs['input_ids']
+    pos_inputs['use_neg'] = True
+    # Run inference
+    with torch.no_grad():
+        outputs = model(**pos_inputs)
+    # Post-process outputs
+    outputs["pred_points"] = outputs["pred_boxes"][:, :, :2]
+    outputs["pred_logits"] = outputs["logits"]
+    threshold = box_threshold if box_threshold > 0 else model.box_threshold
+    pos_queries = outputs["pos_queries"].squeeze(0).float()
+    neg_queries = outputs["neg_queries"].squeeze(0).float()
+    pos_queries = pos_queries[-1].squeeze(0)
+    neg_queries = neg_queries[-1].squeeze(0)
+    pos_queries = pos_queries.unsqueeze(0)
+    neg_queries = neg_queries.unsqueeze(0)
+    results = post_process_grounded_object_detection_with_queries(outputs, pos_queries, box_threshold=threshold)[0]
+    boxes = results["boxes"]
+    boxes = [box.tolist() for box in boxes]
+    points = [[box[0], box[1]] for box in boxes]
+    # Negative prediction
+    neg_points = []
+    neg_results = None
+    if "neg_pred_boxes" in outputs and "neg_logits" in outputs:
+        neg_outputs = outputs.copy()
+        neg_outputs["pred_boxes"] = outputs["neg_pred_boxes"]
+        neg_outputs["logits"] = outputs["neg_logits"]
+        neg_outputs["pred_points"] = outputs["neg_pred_boxes"][:, :, :2]
+        neg_outputs["pred_logits"] = outputs["neg_logits"]
+        neg_results = post_process_grounded_object_detection_with_queries(neg_outputs, neg_queries, box_threshold=threshold)[0]
+        neg_boxes = neg_results["boxes"]
+        neg_boxes = [box.tolist() for box in neg_boxes]
+        neg_points = [[box[0], box[1]] for box in neg_boxes]
+    pos_queries_np = results["queries"].cpu().numpy()
+    neg_queries_np = neg_results["queries"].cpu().numpy() if neg_results else np.array([])
+    img_size = image.size
+    if len(neg_points) > 0 and len(neg_queries_np) > 0:
+        filtered_points, kept_indices, suppression_info = discriminative_point_suppression(
+            points,
+            neg_points,
+            pos_queries_np,
+            neg_queries_np,
+            image_size=img_size,
+            pixel_threshold=5,
+            similarity_threshold=0.3,
+        )
+        filtered_boxes = [boxes[i] for i in kept_indices]
+    else:
+        filtered_points = points
+        filtered_boxes = boxes
+    points = filtered_points
+    boxes = filtered_boxes
+    # Visualize results
+    img_w, img_h = image.size
+    img_draw = image.copy()
+    draw = ImageDraw.Draw(img_draw)
+    for point in points:
+        x = point[0] * img_w
+        y = point[1] * img_h
+        draw.ellipse(
+            [x - point_radius, y - point_radius, x + point_radius, y + point_radius],
+            fill=point_color
+        )
+    count = len(points)
+    return img_draw, f"Count: {count}", parsed_info
+def count_objects_manual(image, pos_caption, neg_caption, box_threshold, point_radius, point_color):
+    """
+    Manual mode: directly use provided positive and negative captions.
+    """
+    global model, processor, device
+    if model is None:
+        load_model()
     # Ensure captions end with period
+    if pos_caption and not pos_caption.endswith('.'):
         pos_caption = pos_caption + '.'
     if neg_caption and not neg_caption.endswith('.'):
         neg_caption = neg_caption + '.'
+    if not neg_caption or neg_caption.strip() == '':
+        neg_caption = "None."
+    parsed_info = f"Positive: {pos_caption}\nNegative: {neg_caption}"
+    # Ensure image is RGB
+    if image.mode != "RGB":
+        image = image.convert("RGB")
     # Process positive caption
     pos_inputs = processor(
     pos_inputs = pos_inputs.to(device)
     pos_inputs['pixel_values'] = pos_inputs['pixel_values'].to(torch.bfloat16)
+    # Process negative caption
+    use_neg = bool(neg_caption and neg_caption.strip() and neg_caption != '.' and neg_caption != 'None.')
     if not use_neg:
         neg_caption = "None."
     neg_inputs = processor(
         images=image,
     pos_inputs['neg_pixel_values'] = neg_inputs['pixel_values']
     pos_inputs['neg_input_ids'] = neg_inputs['input_ids']
     pos_inputs['use_neg'] = True
     # Run inference
     with torch.no_grad():
         outputs = model(**pos_inputs)
     # Post-process outputs
     outputs["pred_points"] = outputs["pred_boxes"][:, :, :2]
     outputs["pred_logits"] = outputs["logits"]
     boxes = [box.tolist() for box in boxes]
     points = [[box[0], box[1]] for box in boxes]
+    # Negative prediction
+    neg_points = []
+    neg_results = None
     if "neg_pred_boxes" in outputs and "neg_logits" in outputs:
         neg_outputs = outputs.copy()
         neg_outputs["pred_boxes"] = outputs["neg_pred_boxes"]
         neg_boxes = [box.tolist() for box in neg_boxes]
         neg_points = [[box[0], box[1]] for box in neg_boxes]
+    pos_queries_np = results["queries"].cpu().numpy()
+    neg_queries_np = neg_results["queries"].cpu().numpy() if neg_results else np.array([])
     img_size = image.size
+    if len(neg_points) > 0 and len(neg_queries_np) > 0:
+        filtered_points, kept_indices, suppression_info = discriminative_point_suppression(
+            points,
+            neg_points,
+            pos_queries_np,
+            neg_queries_np,
+            image_size=img_size,
+            pixel_threshold=5,
+            similarity_threshold=0.3,
+        )
+        filtered_boxes = [boxes[i] for i in kept_indices]
+    else:
+        filtered_points = points
+        filtered_boxes = boxes
     points = filtered_points
     boxes = filtered_boxes
             [x - point_radius, y - point_radius, x + point_radius, y + point_radius],
             fill=point_color
         )
     count = len(points)
+    return img_draw, f"Count: {count}", parsed_info
 # Create Gradio interface
     with gr.Blocks(title="CountEx: Discriminative Visual Counting") as demo:
         gr.Markdown("""
         # CountEx: Fine-Grained Counting via Exemplars and Exclusion
+        Count specific objects in images using text prompts with exclusion capability.
         """)
+        # State to track current input mode
+        current_mode = gr.State(value="natural_language")
         with gr.Row():
+            # Left column - Input
             with gr.Column(scale=1):
                 input_image = gr.Image(type="pil", label="Input Image")
+                with gr.Tabs() as input_tabs:
+                    # Tab 1: Natural Language Input
+                    with gr.TabItem("Natural Language", id=0) as tab_nl:
+                        instruction = gr.Textbox(
+                            label="Counting Instruction",
+                            placeholder="e.g., Count apples, not green apples",
+                            value="Count apples, not green apples",
+                            lines=2
+                        )
+                        gr.Markdown("""
+                        **Examples:**
+                        - "Count apples, not green apples"
+                        - "Count red and black beans, exclude white beans"
+                        - "Count people, not women"
+                        """)
+                    # Tab 2: Manual Input
+                    with gr.TabItem("Manual Input", id=1) as tab_manual:
+                        pos_caption = gr.Textbox(
+                            label="Positive Prompt (objects to count)",
+                            placeholder="e.g., apple",
+                            value="apple."
+                        )
+                        neg_caption = gr.Textbox(
+                            label="Negative Prompt (objects to exclude)",
+                            placeholder="e.g., green apple",
+                            value="None."
+                        )
+                # Single submit button outside tabs
+                submit_btn = gr.Button("Count Objects", variant="primary", size="lg")
+                # Shared settings
+                with gr.Accordion("Advanced Settings", open=False):
+                    box_threshold = gr.Slider(
+                        minimum=0.0,
+                        maximum=1.0,
+                        value=0.42,
+                        step=0.01,
+                        label="Detection Threshold"
+                    )
+                    point_radius = gr.Slider(
+                        minimum=1,
+                        maximum=20,
+                        value=5,
+                        step=1,
+                        label="Point Radius"
+                    )
+                    point_color = gr.Dropdown(
+                        choices=["blue", "red", "green", "yellow", "cyan", "magenta", "white"],
+                        value="blue",
+                        label="Point Color"
+                    )
+            # Right column - Output
             with gr.Column(scale=1):
                 output_image = gr.Image(type="pil", label="Result")
                 count_output = gr.Textbox(label="Count Result")
+                parsed_output = gr.Textbox(label="Parsed Captions", lines=2)
+        # Examples for Natural Language mode
+        gr.Markdown("### Examples (Natural Language)")
         gr.Examples(
             examples=[
+                ["examples/apples.png", "Count apples, exclude green apples"],
+                ["examples/apple.jpg", "Count apples, not red apples"],
+                ["examples/black_beans.jpg", "Count black beans, not soy beans"],
+                ["examples/candy.jpg", "Count brown coffee candy, exclude black coffee candy"],
+                ["examples/strawberry.jpg", "Count blueberries"],
+                ["examples/strawberry2.jpg", "Count blueberries"],
+                ["examples/women.jpg", "Count people, not women"],
+                ["examples/boat-1.jpg", "Count boats, exclude blue boats"],
             ],
+            inputs=[input_image, instruction],
+            outputs=[output_image, count_output, parsed_output],
             fn=count_objects,
             cache_examples=False,
         )
+        # Update mode when tab changes
+        def set_mode_nl():
+            return "natural_language"
+        def set_mode_manual():
+            return "manual"
+        tab_nl.select(fn=set_mode_nl, outputs=[current_mode])
+        tab_manual.select(fn=set_mode_manual, outputs=[current_mode])
+        # Unified handler that routes based on mode
+        def handle_submit(mode, image, instr, pos_cap, neg_cap, threshold, radius, color):
+            if mode == "natural_language":
+                return count_objects(image, instr, threshold, radius, color)
+            else:
+                return count_objects_manual(image, pos_cap, neg_cap, threshold, radius, color)
+        # Single button click handler
         submit_btn.click(
+            fn=handle_submit,
+            inputs=[current_mode, input_image, instruction, pos_caption, neg_caption,
+                    box_threshold, point_radius, point_color],
+            outputs=[output_image, count_output, parsed_output]
         )
     return demo