Simplified model loading

Browse files

Files changed (15) hide show

.gitignore +1 -0
README.md +2 -2
config.json +6 -0
configuration_talk2dino.py +49 -0
hf_demo.ipynb +0 -0
modeling_talk2dino.py +42 -0
{hf_model → src}/__init__.py +0 -0
hf_model/talk2dino.py → src/dinotext.py +12 -43
{hf_model → src}/hooks.py +0 -0
{hf_model → src}/masker.py +2 -2
{hf_model → src}/model.py +1 -1
{hf_model → src}/modules.py +0 -0
{hf_model → src}/pamr.py +0 -0
{hf_model → src}/templates.py +0 -0
{hf_model → src}/us.py +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__/

README.md CHANGED Viewed

@@ -43,14 +43,14 @@ Open-Vocabulary Segmentation (OVS) aims at segmenting images from free-form text
 ### Mapping CLIP Text Embeddings to DINOv2 space with Talk2DINO
 We can use Talk2DINO to map CLIP text embeddings into the DINOv2 patch embedding space.
 ```python
-from hf_model.talk2dino import Talk2DINO
 from torchvision.io import read_image
 # Device setup
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 # Model Loading
-model = Talk2DINO.from_pretrained("lorebianchi98/Talk2DINO-ViTL").to(device).eval()
 # Embedding generation
 with torch.no_grad():

 ### Mapping CLIP Text Embeddings to DINOv2 space with Talk2DINO
 We can use Talk2DINO to map CLIP text embeddings into the DINOv2 patch embedding space.
 ```python
+import torch
+from transformers import AutoModel
 from torchvision.io import read_image
 # Device setup
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 # Model Loading
+model = AutoModel.from_pretrained("lorebianchi98/Talk2DINO-ViTL", trust_remote_code=True).to(device).eval()
 # Embedding generation
 with torch.no_grad():

config.json CHANGED Viewed

@@ -1,4 +1,10 @@
 {
   "avg_self_attn_token": false,
   "clip_model_name": "ViT-B/16",
   "disentangled_self_attn_token": true,

 {
+  "architectures": ["Talk2DINO"],
+  "model_type": "talk2dino",
+  "auto_map": {
+    "AutoConfig": "configuration_talk2dino.Talk2DINOConfig",
+    "AutoModel": "modeling_talk2dino.Talk2DINO"
+  },
   "avg_self_attn_token": false,
   "clip_model_name": "ViT-B/16",
   "disentangled_self_attn_token": true,

configuration_talk2dino.py ADDED Viewed

	@@ -0,0 +1,49 @@

+from transformers import PretrainedConfig
+class Talk2DINOConfig(PretrainedConfig):
+    """Hugging Face configuration for the Talk2DINO model.
+
+    Every named keyword argument of ``__init__`` is stored unchanged as an
+    instance attribute of the same name. Any additional keyword arguments
+    are forwarded to ``PretrainedConfig.__init__``.
+
+    Apart from ``type``, the option names match the parameters of
+    ``DINOText.__init__`` in ``src/dinotext.py``; their meaning is defined
+    there.
+    """
+    # Identifier of this configuration family; config.json uses the same
+    # string for its "model_type" entry.
+    model_type = "talk2dino"
+    def __init__(
+        self,
+        avg_self_attn_token=False,
+        clip_model_name="ViT-B/16",
+        disentangled_self_attn_token=True,
+        is_eval=True,
+        keep_cls=False,
+        keep_end_seq=False,
+        loss=None,
+        model_name="dinov2_vitb14_reg",
+        pre_trained=True,
+        proj_class="vitb_mlp_infonce",
+        proj_model="ProjectionLayer",
+        proj_name="vitb_mlp_infonce",
+        resize_dim=518,
+        # NOTE(review): this parameter shadows the builtin ``type`` inside
+        # __init__; presumably the name mirrors a key in config.json - confirm
+        # before renaming.
+        type="DINOText",
+        unfreeze_last_image_layer=False,
+        unfreeze_last_text_layer=False,
+        use_avg_text_token=False,
+        with_bg_clean=False,
+        **kwargs,
+    ):
+        # Keyword arguments not named above are handled by the base class.
+        super().__init__(**kwargs)
+        # Store every named option verbatim as an attribute of the same name.
+        self.avg_self_attn_token = avg_self_attn_token
+        self.clip_model_name = clip_model_name
+        self.disentangled_self_attn_token = disentangled_self_attn_token
+        self.is_eval = is_eval
+        self.keep_cls = keep_cls
+        self.keep_end_seq = keep_end_seq
+        self.loss = loss
+        self.model_name = model_name
+        self.pre_trained = pre_trained
+        self.proj_class = proj_class
+        self.proj_model = proj_model
+        self.proj_name = proj_name
+        self.resize_dim = resize_dim
+        self.type = type
+        self.unfreeze_last_image_layer = unfreeze_last_image_layer
+        self.unfreeze_last_text_layer = unfreeze_last_text_layer
+        self.use_avg_text_token = use_avg_text_token
+        self.with_bg_clean = with_bg_clean

hf_demo.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

modeling_talk2dino.py ADDED Viewed

	@@ -0,0 +1,42 @@

+from src.dinotext import DINOText
+from transformers import PreTrainedModel
+from configuration_talk2dino import Talk2DINOConfig
+import clip
+import torch
+class Talk2DINO(DINOText, PreTrainedModel):
+    """``DINOText`` combined with the Hugging Face ``PreTrainedModel`` base.
+
+    Overrides ``encode_text`` and ``encode_image`` so that they accept raw
+    strings and images instead of already tokenized / preprocessed tensors.
+    """
+    # Configuration class paired with this model.
+    config_class = Talk2DINOConfig
+    def __init__(self, config: Talk2DINOConfig):
+        """Build the model from the fields of ``config``.
+
+        Args:
+            config: configuration whose dictionary form is passed to the
+                parent initializer as keyword arguments.
+        """
+        # Keep a reference to the config on the instance
+        self.config = config
+        # Convert config to a dict (works for PretrainedConfig subclasses)
+        cfg_dict = config.to_dict()
+        # DINOText is listed first among the bases, so super().__init__
+        # resolves to DINOText.__init__; keys it does not name explicitly are
+        # absorbed by its **kwargs parameter.
+        # NOTE(review): DINOText.__init__ calls nn.Module.__init__ directly
+        # instead of super().__init__(), so PreTrainedModel.__init__ does not
+        # appear to run for this class - confirm this is intended.
+        super().__init__(**cfg_dict)
+    def encode_text(self, texts):
+        """Embed raw text with CLIP and project it with ``self.proj``.
+
+        texts: string or list of strings
+        returns: text embeddings (N, D) where N is the number of texts, D is the embedding dimension
+        """
+        # Tokenize, then move the tokens to the device of the first model parameter.
+        text_tokens = clip.tokenize(texts).to(self.parameters().__next__().device)
+        txt_embed = self.clip_model.encode_text(text_tokens)
+        # Map the CLIP text embedding through the projection layer.
+        txt_embed = self.proj.project_clip_txt(txt_embed)
+        return txt_embed
+    def encode_image(self, images):
+        """Preprocess images and return the backbone's patch embeddings.
+
+        images: PIL image or list of PIL images
+        returns: image embeddings (N, L, D) where N is the number of images, L is the number of patches, D is the embedding dimension
+        """
+        # Wrap a single input in a list. The check is an exact type test, so a
+        # tuple of images is also wrapped as if it were one image.
+        if type(images) is not list:
+            images = [images]
+        # Apply the model's transforms and move each result to the device of
+        # the first model parameter before batching.
+        img_preprocessed = [self.image_transforms(img).to(next(self.parameters()).device) for img in images]
+        img_preprocessed = torch.stack(img_preprocessed)
+        if 'dinov2' in self.model_name or 'dinov3' in self.model_name:
+            # These backbones return a dict; take its patch-token entry.
+            img_embed = self.model.forward_features(img_preprocessed)['x_norm_patchtokens']
+        elif 'mae' in self.model_name or 'clip' in self.model_name or 'dino' in self.model_name:
+            # Drop the first token along dimension 1 (presumably the CLS token - confirm).
+            img_embed = self.model.forward_features(img_preprocessed)[:, 1:, :]
+        # NOTE(review): if model_name matches neither branch, img_embed is never
+        # assigned and the return below raises UnboundLocalError.
+        return img_embed

{hf_model → src}/__init__.py RENAMED Viewed

File without changes

hf_model/talk2dino.py → src/dinotext.py RENAMED Viewed

@@ -16,14 +16,14 @@ from transformers import BertModel, AutoTokenizer
 import torchvision.transforms as T
 import clip
 import importlib
-import hf_model.us as us
-from hf_model.pamr import PAMR
-from hf_model.masker import DINOTextMasker
-from hf_model.templates import get_template
-from hf_model.model import ProjectionLayer, VisualProjectionLayer, CLIPLastLayer, DoubleMLP
-from hf_model.hooks import average_text_tokens, get_vit_out, feats
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -45,7 +45,8 @@ class DINOText(nn.Module):
             self, model_name, resize_dim, clip_model_name, proj_class, proj_name, proj_model, avg_self_attn_token=False, disentangled_self_attn_token=True, loss=None, pre_trained=True,
             unfreeze_last_text_layer=False, unfreeze_last_image_layer=False, is_eval=True, use_avg_text_token=False, keep_cls=False, keep_end_seq=False, with_bg_clean=False, **kwargs
     ):
-        super().__init__()
         self.feats = {}
         self.model_name = model_name
         # loading the model
@@ -82,7 +83,7 @@ class DINOText(nn.Module):
             T.Normalize(mean, std),
         ])
-        self.model.to(device)
         self.model.requires_grad_(False)
         self.clip_model_name = clip_model_name
@@ -91,7 +92,7 @@ class DINOText(nn.Module):
             # load the corresponding wordtokenizer
             self.tokenizer = AutoTokenizer.from_pretrained(self.clip_model_name)
         else:
-            self.clip_model, _ = clip.load(clip_model_name, device=device)
         self.clip_model.eval()
         self.clip_model.requires_grad_(False)
         if unfreeze_last_text_layer:
@@ -118,13 +119,11 @@ class DINOText(nn.Module):
             }
         self.proj = ProjectionLayer.from_config(config)
-        if type(self.proj) == CLIPLastLayer:
-            self.clip_model.transformer.resblocks[-2].register_forward_hook(self.get_clip_second_last_dense_out)
         # if pre_trained:
         #     self.proj.load_state_dict(torch.load(os.path.join("weights", f"{proj_name}.pth"), 'cpu'))
-        self.proj.to(device)
         self.masker = DINOTextMasker(similarity_type="cosine")
         self.masker = self.masker.eval()
@@ -166,12 +165,7 @@ class DINOText(nn.Module):
             return self_attn
     def encode_text(self, tokenized_texts):
-        if type(self.proj) == CLIPLastLayer:
-            self.clip_model.encode_text(tokenized_texts)
-            x = self.feats['clip_second_last_out']
-            x = x.to(dtype=torch.float32)
-        else:
-            x = self.clip_model.encode_text(tokenized_texts)
         return x
     def encode_image(self, images):
@@ -404,29 +398,4 @@ class DINOText(nn.Module):
         return mask_output
-from huggingface_hub import PyTorchModelHubMixin
-class Talk2DINO(DINOText, PyTorchModelHubMixin):
-    def encode_text(self, texts):
-        """ texts: string or list of strings
-         returns: text embeddings (N, D) where N is the number of texts, D is the embedding dimension
-        """
-        text_tokens = clip.tokenize(texts).to(self.parameters().__next__().device)
-        txt_embed = self.clip_model.encode_text(text_tokens)
-        txt_embed = self.proj.project_clip_txt(txt_embed)
-        return txt_embed
-    def encode_image(self, images):
-        """ images: PIL image or list of PIL images
-         returns: image embeddings (N, L, D) where N is the number of images, L is the number of patches, D is the embedding dimension
-        """
-        if type(images) is not list:
-            images = [images]
-        img_preprocessed = [self.image_transforms(img).to(next(self.parameters()).device) for img in images]
-        img_preprocessed = torch.stack(img_preprocessed)
-        if 'dinov2' in self.model_name or 'dinov3' in self.model_name:
-            img_embed = self.model.forward_features(img_preprocessed)['x_norm_patchtokens']
-        elif 'mae' in self.model_name or 'clip' in self.model_name or 'dino' in self.model_name:
-            img_embed = self.model.forward_features(img_preprocessed)[:, 1:, :]
-        return img_embed

 import torchvision.transforms as T
 import clip
 import importlib
+import src.us as us
+from src.pamr import PAMR
+from src.masker import DINOTextMasker
+from src.templates import get_template
+from src.model import ProjectionLayer, VisualProjectionLayer, CLIPLastLayer, DoubleMLP
+from src.hooks import average_text_tokens, get_vit_out, feats
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
             self, model_name, resize_dim, clip_model_name, proj_class, proj_name, proj_model, avg_self_attn_token=False, disentangled_self_attn_token=True, loss=None, pre_trained=True,
             unfreeze_last_text_layer=False, unfreeze_last_image_layer=False, is_eval=True, use_avg_text_token=False, keep_cls=False, keep_end_seq=False, with_bg_clean=False, **kwargs
     ):
+        nn.Module.__init__(self)
         self.feats = {}
         self.model_name = model_name
         # loading the model
             T.Normalize(mean, std),
         ])
+        self.model
         self.model.requires_grad_(False)
         self.clip_model_name = clip_model_name
             # load the corresponding wordtokenizer
             self.tokenizer = AutoTokenizer.from_pretrained(self.clip_model_name)
         else:
+            self.clip_model, _ = clip.load(clip_model_name, device='meta')
         self.clip_model.eval()
         self.clip_model.requires_grad_(False)
         if unfreeze_last_text_layer:
             }
         self.proj = ProjectionLayer.from_config(config)
         # if pre_trained:
         #     self.proj.load_state_dict(torch.load(os.path.join("weights", f"{proj_name}.pth"), 'cpu'))
+        self.proj
         self.masker = DINOTextMasker(similarity_type="cosine")
         self.masker = self.masker.eval()
             return self_attn
     def encode_text(self, tokenized_texts):
+        x = self.clip_model.encode_text(tokenized_texts)
         return x
     def encode_image(self, images):
         return mask_output

{hf_model → src}/hooks.py RENAMED Viewed

File without changes

{hf_model → src}/masker.py RENAMED Viewed

@@ -8,11 +8,11 @@ import torch
 import torch.distributed as dist
 import torch.nn as nn
 import torch.nn.functional as F
-import hf_model.us as us
 from einops import rearrange, repeat
 # from models.dinotext.gumbel import gumbel_sigmoid
-from hf_model.modules import FeatureEncoder
 from omegaconf import OmegaConf

 import torch.distributed as dist
 import torch.nn as nn
 import torch.nn.functional as F
+import src.us as us
 from einops import rearrange, repeat
 # from models.dinotext.gumbel import gumbel_sigmoid
+from src.modules import FeatureEncoder
 from omegaconf import OmegaConf

{hf_model → src}/model.py RENAMED Viewed

@@ -4,7 +4,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from hf_model.hooks import get_self_attention, process_self_attention, feats
 class VisualProjectionLayer(nn.Module):
     """

 import torch.nn as nn
 import torch.nn.functional as F
+from src.hooks import get_self_attention, process_self_attention, feats
 class VisualProjectionLayer(nn.Module):
     """

{hf_model → src}/modules.py RENAMED Viewed

File without changes

{hf_model → src}/pamr.py RENAMED Viewed

File without changes

{hf_model → src}/templates.py RENAMED Viewed

File without changes

{hf_model → src}/us.py RENAMED Viewed

File without changes