flexthink
/

discrete_wavlm_spk_rec_ecapatdn_lite

@@ -1,6 +1,8 @@
 import torch
 from speechbrain.inference.interfaces import Pretrained
 class AttentionMLP(torch.nn.Module):
     def __init__(self, input_dim, hidden_dim):
         super(AttentionMLP, self).__init__()
@@ -32,8 +34,11 @@ class Discrete_EmbeddingLayer(torch.nn.Module):
     init: boolean (default: False):
         If set to True, init the embedding with the tokenizer embedding otherwise init randomly.
     freeze: boolean (default: False)
-       If True, the embedding is frozen. If False, the model will be trained
         alongside with the rest of the pipeline.
     Example
     -------
@@ -62,6 +67,7 @@ class Discrete_EmbeddingLayer(torch.nn.Module):
         freeze=False,
         available_layers=None,
         layers=None,
     ):
         super(Discrete_EmbeddingLayer, self).__init__()
         self.vocab_size = vocab_size
@@ -74,6 +80,8 @@ class Discrete_EmbeddingLayer(torch.nn.Module):
         self.layers = layers
         self.available_layers = available_layers
         self.offsets = self.build_offsets()
     def init_embedding(self, weights):
         with torch.no_grad():
@@ -111,6 +119,77 @@ class Discrete_EmbeddingLayer(torch.nn.Module):
             in_embs = self.embedding(in_tokens_offset.int())
             return in_embs
 class DiscreteSpkEmb(Pretrained):
     """A ready-to-use class for utterance-level classification (e.g, speaker-id,
@@ -168,5 +247,54 @@ class DiscreteSpkEmb(Pretrained):
         embeddings = self.mods.embedding_model(feats, length)
         return embeddings.squeeze(1)
     def forward(self, audio, length=None):
-        return self.encode_batch(audio, length)

 import torch
+import math
 from speechbrain.inference.interfaces import Pretrained
 class AttentionMLP(torch.nn.Module):
     def __init__(self, input_dim, hidden_dim):
         super(AttentionMLP, self).__init__()
     init: boolean (default: False):
         If set to True, init the embedding with the tokenizer embedding otherwise init randomly.
     freeze: boolean (default: False)
+        If True, the embedding is frozen. If False, the model will be trained
         alongside with the rest of the pipeline.
+    chunk_size: int
+        The size of lengthwise chunks used when evaluating via
+        Gumbel softmax
     Example
     -------
         freeze=False,
         available_layers=None,
         layers=None,
+        chunk_size=100,
     ):
         super(Discrete_EmbeddingLayer, self).__init__()
         self.vocab_size = vocab_size
         self.layers = layers
         self.available_layers = available_layers
         self.offsets = self.build_offsets()
+        self.layer_embs = self.compute_layer_embs()
+        self.chunk_size = chunk_size
     def init_embedding(self, weights):
         with torch.no_grad():
             in_embs = self.embedding(in_tokens_offset.int())
             return in_embs
def compute_layer_embs(self):
    """Gathers the embedding-table slice of every selected layer.

    Each entry of ``self.layers`` is mapped to its position within
    ``self.available_layers``; the slice of ``self.embedding.weight``
    holding ``self.vocab_size`` rows starting at
    ``position * self.vocab_size`` is taken for it.

    Returns
    -------
    layer_embs : torch.Tensor
        The per-layer slices stacked in the order of ``self.layers``,
        with two singleton dimensions prepended so that the result
        broadcasts against batch and length axes, i.e.
        [1, 1, layer, token, emb] when the embedding weight is a
        2-D table.
    """
    table = self.embedding.weight

    # Position of every available layer inside the shared table
    position_of = {}
    for position, layer in enumerate(self.available_layers):
        position_of[layer] = position

    # One block of `vocab_size` rows per requested layer
    starts = [position_of[layer] * self.vocab_size for layer in self.layers]
    per_layer = []
    for start in starts:
        per_layer.append(table[start:start + self.vocab_size])

    # Prepend two singleton axes (same as unsqueeze(0) twice)
    return torch.stack(per_layer)[None, None]
def encode_logits(self, logits, length=None):
    """Computes differentiable embeddings from a batch of token logits.

    The most likely token of every head is selected, and the matching
    row of that head's embedding table is returned. A straight-through
    Gumbel-softmax estimator keeps the result differentiable with
    respect to ``logits``.

    Arguments
    ---------
    logits : torch.Tensor
        Batch of discrete unit logits [batch, length, head, token]
    length : torch.Tensor
        Not used by this method; accepted for interface compatibility.

    Returns
    -------
    emb : torch.Tensor
        Batch of embeddings [batch, length, head, emb]
    """
    # Soft (differentiable) relaxation of the token choice
    units_gumbel = torch.nn.functional.gumbel_softmax(
        logits, hard=False, dim=-1
    )

    # Straight-through trick: the forward value is the one-hot argmax of
    # the logits, while the gradient flows through the soft sample
    argmax_idx = logits.argmax(dim=-1, keepdim=True)
    units_ref = torch.zeros_like(logits).scatter_(
        dim=-1, index=argmax_idx, value=1.0
    )
    units_hard = units_ref - units_gumbel.detach() + units_gumbel

    # Rebuild the per-layer tables from the live embedding weights.
    # A copy stacked at construction time would not follow later weight
    # loading, training updates or device moves.
    layer_embs = self.compute_layer_embs()

    # Select the embeddings chunk by chunk along the length axis to
    # bound the size of the broadcast product. `split` (unlike `chunk`
    # with a computed count) also handles a zero-length input.
    emb = torch.cat(
        [
            (layer_embs * units_hard_chunk.unsqueeze(-1)).sum(-2)
            for units_hard_chunk in units_hard.split(self.chunk_size, dim=1)
        ],
        dim=1,
    )
    return emb
 class DiscreteSpkEmb(Pretrained):
     """A ready-to-use class for utterance-level classification (e.g, speaker-id,
         embeddings = self.mods.embedding_model(feats, length)
         return embeddings.squeeze(1)
def encode_logits(self, logits, length=None):
    """Encodes a batch of token logits into one embedding per item.

    The logits are embedded by the discrete embedding layer, the
    per-head embeddings are combined with the weights produced by the
    attention MLP, and the result is passed to the embedding model.

    Arguments
    ---------
    logits : torch.tensor
        Batch of token logits [batch, time, heads, tokens]
    length : torch.tensor
        Lengths of the waveforms relative to the longest one in the
        batch, tensor of shape [batch]. The longest one should have
        relative length 1.0 and others len(waveform) / max_length.
        Used for ignoring padding.

    Returns
    -------
    torch.tensor
        The encoded batch
    """
    modules = self.mods
    token_embs = modules.discrete_embedding_layer.encode_logits(logits)

    # Attention-weighted combination over the head axis
    head_weights = modules.attention_mlp(token_embs)
    pooled = (head_weights.transpose(2, -1) @ token_embs).squeeze(-2)

    return modules.embedding_model(pooled, length).squeeze(1)
     def forward(self, audio, length=None):
         """Encodes the input audio into a single vector embedding.

         Dispatches on the number of dimensions of ``audio``: 3-D input is
         handled by ``encode_batch`` and 4-D input by ``encode_logits``.

         Arguments
         ---------
         audio : torch.tensor
             Batch of tokenized audio [batch, time, heads]
             or logits [batch, time, heads, tokens]
         length : torch.tensor
             Lengths of the waveforms relative to the longest one in the
             batch, tensor of shape [batch]. The longest one should have
             relative length 1.0 and others len(waveform) / max_length.
             Used for ignoring padding.

         Returns
         -------
         torch.tensor
             The encoded batch

         Raises
         ------
         ValueError
             If ``audio`` is neither 3- nor 4-dimensional.
         """
         audio_dim = audio.dim()
         if audio_dim == 3:
             return self.encode_batch(audio, length)
         if audio_dim == 4:
             return self.encode_logits(audio, length)
         # f-string so that the offending shape is actually reported
         raise ValueError(f"Unsupported audio shape {audio.shape}")