Spaces:
Sleeping
Sleeping
Commit
·
9115945
1
Parent(s):
288ef96
(feat) spatial semantic sup
Browse files- app.py +128 -2
- hf_model/CountEX.py +2 -0
- hf_model/modeling_grounding_dino.py +2 -0
app.py
CHANGED
|
@@ -75,6 +75,117 @@ def filter_points_by_negative(points, neg_points, image_size, pixel_threshold=5)
|
|
| 75 |
|
| 76 |
return filtered_points, filtered_indices
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
def count_objects(image, pos_caption, neg_caption, box_threshold, point_radius, point_color):
|
| 79 |
"""
|
| 80 |
Main inference function for counting objects
|
|
@@ -167,12 +278,27 @@ def count_objects(image, pos_caption, neg_caption, box_threshold, point_radius,
|
|
| 167 |
neg_boxes = [box.tolist() for box in neg_boxes]
|
| 168 |
neg_points = [[box[0], box[1]] for box in neg_boxes]
|
| 169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
img_size = image.size
|
| 171 |
-
filtered_points, kept_indices = filter_points_by_negative(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
points,
|
| 173 |
neg_points,
|
|
|
|
|
|
|
| 174 |
image_size=img_size,
|
| 175 |
-
pixel_threshold=5
|
|
|
|
|
|
|
| 176 |
)
|
| 177 |
|
| 178 |
filtered_boxes = [boxes[i] for i in kept_indices]
|
|
|
|
| 75 |
|
| 76 |
return filtered_points, filtered_indices
|
| 77 |
|
| 78 |
+
def discriminative_point_suppression(
    points,
    neg_points,
    pos_queries,
    neg_queries,
    image_size,
    pixel_threshold=5,
    similarity_threshold=0.5,
    mode="and"
):
    """
    Discriminative Point Suppression (DPS).

    Suppress positive predictions that are both spatially close to AND
    semantically similar with negative predictions.

    Motivation: spatial proximity alone may cause false suppression when
    positive and negative queries represent different semantic concepts.
    By jointly verifying spatial AND semantic alignment, suppression only
    occurs for true conflicts.

    Args:
        points: list of [x, y] positive points (normalized, 0-1).
        neg_points: list of [x, y] negative points (normalized, 0-1).
        pos_queries: (N, D) query embeddings for positive predictions;
            row i must correspond to points[i].
        neg_queries: (M, D) query embeddings for negative predictions;
            row j must correspond to neg_points[j].
        image_size: (width, height) in pixels.
        pixel_threshold: spatial distance threshold in pixels.
        similarity_threshold: cosine-similarity threshold ("and" mode) or
            combined-score threshold ("weighted" mode).
        mode: "and" for hard joint condition, "weighted" for soft combination.

    Returns:
        filtered_points: points kept after suppression.
        filtered_indices: indices (into `points`) of the kept points.
        suppression_info: dict with detailed suppression decisions for
            analysis/visualization (empty dict when there is nothing to do,
            matching the original early-return contract).

    Raises:
        ValueError: if `mode` is unknown, or if the number of query
            embeddings does not match the number of points on either side.
    """
    # Nothing to suppress (or nothing to keep): return inputs untouched.
    if not neg_points or not points:
        return points, list(range(len(points))), {}

    width, height = image_size
    scale = np.array([width, height], dtype=float)
    pos_xy = np.asarray(points, dtype=float) * scale      # (N, 2) in pixels
    neg_xy = np.asarray(neg_points, dtype=float) * scale  # (M, 2) in pixels

    pos_queries = np.asarray(pos_queries)
    neg_queries = np.asarray(neg_queries)
    # Fail loudly on a query/point count mismatch: with mismatched counts the
    # (N, M) grids below would either raise an opaque broadcast error or, when
    # one side has length 1, silently broadcast and suppress the wrong points.
    if len(pos_queries) != len(pos_xy) or len(neg_queries) != len(neg_xy):
        raise ValueError(
            f"Query/point count mismatch: {len(pos_queries)} pos queries for "
            f"{len(pos_xy)} points, {len(neg_queries)} neg queries for "
            f"{len(neg_xy)} neg points"
        )

    # === Spatial Distance ===  pairwise Euclidean distance, (N, M) in pixels
    spatial_dist = np.linalg.norm(
        pos_xy[:, None, :] - neg_xy[None, :, :], axis=-1
    )

    # === Query Similarity (Cosine) ===  (N, M), range [-1, 1]
    # 1e-8 guards against division by zero for all-zero embeddings.
    pos_q = pos_queries / (np.linalg.norm(pos_queries, axis=-1, keepdims=True) + 1e-8)
    neg_q = neg_queries / (np.linalg.norm(neg_queries, axis=-1, keepdims=True) + 1e-8)
    query_sim = np.dot(pos_q, neg_q.T)

    # === Joint Suppression Decision ===
    if mode == "and":
        # Hard condition: suppress only if BOTH spatially close AND
        # semantically similar to at least one negative.
        spatial_close = spatial_dist < pixel_threshold        # (N, M)
        semantic_similar = query_sim > similarity_threshold   # (N, M)
        # A positive is suppressed if ANY negative satisfies both conditions.
        should_suppress = (spatial_close & semantic_similar).any(axis=1)  # (N,)
    elif mode == "weighted":
        # Soft combination: exponential proximity score (1 at distance 0)
        # times cosine similarity rescaled from [-1, 1] to [0, 1].
        spatial_proximity = np.exp(-spatial_dist / pixel_threshold)  # (N, M)
        semantic_score = (query_sim + 1) / 2                         # (N, M)
        suppression_score = spatial_proximity * semantic_score       # (N, M)
        should_suppress = suppression_score.max(axis=1) > similarity_threshold
    else:
        raise ValueError(f"Unknown mode: {mode}")

    # === Filter ===
    keep_mask = ~should_suppress
    filtered_points = np.asarray(points)[keep_mask].tolist()
    filtered_indices = np.where(keep_mask)[0].tolist()

    # === Suppression Info (for analysis/visualization) ===
    suppression_info = {
        "spatial_dist": spatial_dist,
        "query_similarity": query_sim,
        "suppressed_indices": np.where(should_suppress)[0].tolist(),
        "suppressed_reasons": [],
    }

    # Record why each point was suppressed.
    for i in np.where(should_suppress)[0]:
        if mode == "and":
            matched = np.where(spatial_close[i] & semantic_similar[i])[0].tolist()
        else:
            # Cast to plain int (argmax returns np.int64) so the info dict
            # stays JSON-serializable, matching the "and" branch.
            matched = [int(suppression_score[i].argmax())]

        suppression_info["suppressed_reasons"].append({
            "pos_idx": int(i),
            "matched_neg_idx": matched,
            "min_spatial_dist": float(spatial_dist[i].min()),
            "max_query_sim": float(query_sim[i].max()),
        })

    return filtered_points, filtered_indices, suppression_info
|
| 188 |
+
|
| 189 |
def count_objects(image, pos_caption, neg_caption, box_threshold, point_radius, point_color):
|
| 190 |
"""
|
| 191 |
Main inference function for counting objects
|
|
|
|
| 278 |
neg_boxes = [box.tolist() for box in neg_boxes]
|
| 279 |
neg_points = [[box[0], box[1]] for box in neg_boxes]
|
| 280 |
|
| 281 |
+
pos_queries = outputs["pos_queries"].squeeze(0)
|
| 282 |
+
neg_queries = outputs["neg_queries"].squeeze(0)
|
| 283 |
+
pos_queries = pos_queries.cpu().numpy()
|
| 284 |
+
neg_queries = neg_queries.cpu().numpy()
|
| 285 |
+
|
| 286 |
img_size = image.size
|
| 287 |
+
# filtered_points, kept_indices = filter_points_by_negative(
|
| 288 |
+
# points,
|
| 289 |
+
# neg_points,
|
| 290 |
+
# image_size=img_size,
|
| 291 |
+
# pixel_threshold=5
|
| 292 |
+
# )
|
| 293 |
+
filtered_points, kept_indices, suppression_info = discriminative_point_suppression(
|
| 294 |
points,
|
| 295 |
neg_points,
|
| 296 |
+
pos_queries,
|
| 297 |
+
neg_queries,
|
| 298 |
image_size=img_size,
|
| 299 |
+
pixel_threshold=5,
|
| 300 |
+
similarity_threshold=0.5,
|
| 301 |
+
mode="and"
|
| 302 |
)
|
| 303 |
|
| 304 |
filtered_boxes = [boxes[i] for i in kept_indices]
|
hf_model/CountEX.py
CHANGED
|
@@ -578,6 +578,8 @@ class CountEX(GroundingDinoForObjectDetection):
|
|
| 578 |
extra_logs=logs,
|
| 579 |
neg_logits=neg_logits,
|
| 580 |
neg_pred_boxes=neg_pred_boxes,
|
|
|
|
|
|
|
| 581 |
)
|
| 582 |
|
| 583 |
return dict_outputs
|
|
|
|
| 578 |
extra_logs=logs,
|
| 579 |
neg_logits=neg_logits,
|
| 580 |
neg_pred_boxes=neg_pred_boxes,
|
| 581 |
+
pos_queries=hidden_states,
|
| 582 |
+
neg_queries=neg_hidden_states,
|
| 583 |
)
|
| 584 |
|
| 585 |
return dict_outputs
|
hf_model/modeling_grounding_dino.py
CHANGED
|
@@ -373,6 +373,8 @@ class GroundingDinoObjectDetectionOutput(ModelOutput):
|
|
| 373 |
extra_logs: Optional[Dict] = None
|
| 374 |
neg_logits: Optional[torch.FloatTensor] = None
|
| 375 |
neg_pred_boxes: Optional[torch.FloatTensor] = None
|
|
|
|
|
|
|
| 376 |
|
| 377 |
|
| 378 |
# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->GroundingDino
|
|
|
|
| 373 |
extra_logs: Optional[Dict] = None
|
| 374 |
neg_logits: Optional[torch.FloatTensor] = None
|
| 375 |
neg_pred_boxes: Optional[torch.FloatTensor] = None
|
| 376 |
+
pos_queries: Optional[torch.FloatTensor] = None
|
| 377 |
+
neg_queries: Optional[torch.FloatTensor] = None
|
| 378 |
|
| 379 |
|
| 380 |
# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->GroundingDino
|