Spaces:

shaun3141
/

caribbean-voices-hackathon

Sleeping

shaun3141 committed 23 days ago

Commit

c0cd25b

1 Parent(s): 5c2e86a

Fix: Ensure generation_config and pad_token handling for Whisper training

- Copy generation_config from base model to prevent NoneType errors in Seq2SeqTrainer
- Set task='transcribe' and clear deprecated forced_decoder_ids
- Fix pad_token == eos_token issue by setting pad_token to unk_token
- Ensure pad_token_id is set in generation_config and model config
- Copy additional attributes (main_input_name, forced_decoder_ids, suppress_tokens) for compatibility

Files changed (3) hide show

owsm_model.py +56 -0
training/trainer.py +64 -5
training/whisper_trainer.py +85 -10

owsm_model.py CHANGED Viewed

@@ -175,6 +175,62 @@ class OWSMWithEntityLoss(PreTrainedModel):
             entity_weight=entity_weight
         )
         return model
     def save_pretrained(self, save_directory, **kwargs):

             entity_weight=entity_weight
         )
+        # Copy important attributes from base model to ensure full compatibility
+        # with transformers components like Seq2SeqTrainer, data collators, etc.
+        # 1. generation_config - Required for Seq2SeqTrainer evaluation
+        # Seq2SeqTrainer accesses model.generation_config._from_model_config in prediction_step
+        if hasattr(base_model, 'generation_config') and base_model.generation_config is not None:
+            # Copy generation_config from base model (preferred method)
+            model.generation_config = base_model.generation_config
+        else:
+            # Fallback: create generation_config from model config
+            # This handles cases where base model doesn't have generation_config set
+            try:
+                from transformers import GenerationConfig
+                model.generation_config = GenerationConfig.from_model_config(model.config)
+            except Exception:
+                # If GenerationConfig.from_model_config fails, create a minimal config
+                # This ensures generation_config is never None, preventing AttributeError
+                from transformers import GenerationConfig
+                model.generation_config = GenerationConfig()
+        # 1b. Ensure generation_config uses modern task/language flags instead of deprecated forced_decoder_ids
+        # For Whisper models, prefer task="transcribe" and language settings over forced_decoder_ids
+        # Setting task/language will cause forced_decoder_ids to be ignored (as per transformers deprecation)
+        if hasattr(model.generation_config, 'task'):
+            if model.generation_config.task is None:
+                # Set default task for Whisper models (transcribe, not translate)
+                model.generation_config.task = "transcribe"
+            # If task is set, forced_decoder_ids will be ignored, so we can clear it to avoid warnings
+            if hasattr(model.generation_config, 'forced_decoder_ids') and model.generation_config.forced_decoder_ids is not None:
+                # Clear forced_decoder_ids when task is set to avoid deprecation warnings
+                model.generation_config.forced_decoder_ids = None
+        # 1c. Ensure pad_token_id is set in generation_config to avoid attention mask warnings
+        # This is important when pad_token_id == eos_token_id
+        if hasattr(tokenizer, 'pad_token_id') and tokenizer.pad_token_id is not None:
+            if hasattr(model.generation_config, 'pad_token_id'):
+                model.generation_config.pad_token_id = tokenizer.pad_token_id
+        # If base model has language set, preserve it; otherwise default to None (auto-detect)
+        # Note: For Caribbean Voices, we want transcription, not translation to English
+        # So we don't force language='en' - let the model auto-detect or use what's in config
+        # 2. main_input_name - Important for data collators and input handling
+        # e.g., "input_features" for Whisper, "input_values" for Wav2Vec2
+        if hasattr(base_model, 'main_input_name'):
+            model.main_input_name = base_model.main_input_name
+        # 3. Model-specific config attributes that might be set on the instance
+        # Note: forced_decoder_ids is deprecated in favor of task/language flags in generation_config
+        # We still copy it for backward compatibility, but the modern approach is preferred
+        for attr_name in ['forced_decoder_ids', 'suppress_tokens']:
+            if hasattr(base_model, attr_name):
+                attr_value = getattr(base_model, attr_name)
+                if attr_value is not None:
+                    setattr(model, attr_name, attr_value)
         return model
     def save_pretrained(self, save_directory, **kwargs):

training/trainer.py CHANGED Viewed

@@ -1,9 +1,12 @@
 """Training logic for OWSM fine-tuning."""
 import os
 import json
 import torch
 import numpy as np
 import random
 from typing import Tuple, Optional, Dict, Any
 from datasets import load_dataset, Audio, DatasetDict, disable_caching
 from transformers import (
@@ -16,7 +19,7 @@ from transformers import (
     WhisperProcessor,
 )
 from owsm_model import OWSMWithEntityLoss
-from data.manager import ENTITIES_PATH, MODEL_OUTPUT_DIR, BASE_DIR
 # Disable dataset caching to save disk space
 disable_caching()
@@ -82,10 +85,39 @@ def compute_wer_metric(predictions, labels, tokenizer):
     return {"wer": wer}
-def prepare_dataset_hf(dataset, processor):
     """
     Prepare dataset using Hugging Face Datasets built-in audio handling.
     """
     def prepare_batch(batch):
         """Process a batch of examples."""
@@ -131,11 +163,22 @@ def prepare_dataset_hf(dataset, processor):
         batched=True,
         batch_size=16,
         remove_columns=column_names,
-        desc="Preprocessing dataset",
         load_from_cache_file=False,  # Don't load from cache
         keep_in_memory=True,  # Keep in memory to avoid disk writes
     )
     return dataset
@@ -448,12 +491,26 @@ def run_training_progress(epochs: int, batch_size: int, learning_rate: float, pr
         if progress:
             progress(0.3, desc="Preprocessing training dataset...")
         print("\nPreprocessing training dataset...")
-        train_dataset = prepare_dataset_hf(train_dataset_raw, processor)
         if progress:
             progress(0.4, desc="Preprocessing validation dataset...")
         print("Preprocessing validation dataset...")
-        val_dataset = prepare_dataset_hf(val_dataset_raw, processor)
         # Training arguments
         if progress:
@@ -479,10 +536,12 @@ def run_training_progress(epochs: int, batch_size: int, learning_rate: float, pr
             save_total_limit=3,
             fp16=torch.cuda.is_available(),
             dataloader_num_workers=4,
             report_to="none",
             seed=SEED,
             predict_with_generate=True,  # Still used for seq2seq generation during eval
             generation_max_length=200, # Prevent infinite generation
         )
         # Data collator

 """Training logic for OWSM fine-tuning."""
 import os
+# Disable tokenizers parallelism to avoid fork warning with DataLoader workers
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 import json
 import torch
 import numpy as np
 import random
+import hashlib
 from typing import Tuple, Optional, Dict, Any
 from datasets import load_dataset, Audio, DatasetDict, disable_caching
 from transformers import (
     WhisperProcessor,
 )
 from owsm_model import OWSMWithEntityLoss
+from data.manager import ENTITIES_PATH, MODEL_OUTPUT_DIR, BASE_DIR, CACHE_DIR
 # Disable dataset caching to save disk space
 disable_caching()
     return {"wer": wer}
+def get_cache_key(dataset_name: str, model_name: str, split: str, seed: int) -> str:
+    """Generate a cache key based on dataset, model, split, and seed."""
+    cache_string = f"{dataset_name}_{model_name}_{split}_{seed}"
+    return hashlib.md5(cache_string.encode()).hexdigest()
+def prepare_dataset_hf(dataset, processor, dataset_name: str = None, model_name: str = None, split: str = None, use_cache: bool = True):
     """
     Prepare dataset using Hugging Face Datasets built-in audio handling.
+    Supports caching to avoid reprocessing.
+    Args:
+        dataset: The dataset to process
+        processor: The processor to use for preprocessing
+        dataset_name: Name of the dataset (for cache key)
+        model_name: Name of the model (for cache key)
+        split: Split name ('train' or 'val') (for cache key)
+        use_cache: Whether to use cache if available
     """
+    # Try to load from cache if enabled and cache key components provided
+    if use_cache and dataset_name and model_name and split:
+        cache_key = get_cache_key(dataset_name, model_name, split, SEED)
+        cache_path = os.path.join(CACHE_DIR, cache_key)
+        if os.path.exists(cache_path):
+            print(f"Loading preprocessed {split} dataset from cache: {cache_path}")
+            try:
+                from datasets import load_from_disk
+                cached_dataset = load_from_disk(cache_path)
+                print(f"✓ Successfully loaded cached {split} dataset ({len(cached_dataset):,} samples)")
+                return cached_dataset
+            except Exception as e:
+                print(f"⚠ Failed to load from cache: {e}. Reprocessing...")
     def prepare_batch(batch):
         """Process a batch of examples."""
         batched=True,
         batch_size=16,
         remove_columns=column_names,
+        desc=None,  # Disable progress bar for dataset preprocessing
         load_from_cache_file=False,  # Don't load from cache
         keep_in_memory=True,  # Keep in memory to avoid disk writes
     )
+    # Save to cache if enabled and cache key components provided
+    if use_cache and dataset_name and model_name and split:
+        cache_key = get_cache_key(dataset_name, model_name, split, SEED)
+        cache_path = os.path.join(CACHE_DIR, cache_key)
+        print(f"Saving preprocessed {split} dataset to cache: {cache_path}")
+        try:
+            dataset.save_to_disk(cache_path)
+            print(f"✓ Successfully cached {split} dataset ({len(dataset):,} samples)")
+        except Exception as e:
+            print(f"⚠ Failed to save to cache: {e}. Continuing without cache...")
     return dataset
         if progress:
             progress(0.3, desc="Preprocessing training dataset...")
         print("\nPreprocessing training dataset...")
+        train_dataset = prepare_dataset_hf(
+            train_dataset_raw,
+            processor,
+            dataset_name=HF_DATASET_NAME,
+            model_name=MODEL_NAME,
+            split="train",
+            use_cache=True
+        )
         if progress:
             progress(0.4, desc="Preprocessing validation dataset...")
         print("Preprocessing validation dataset...")
+        val_dataset = prepare_dataset_hf(
+            val_dataset_raw,
+            processor,
+            dataset_name=HF_DATASET_NAME,
+            model_name=MODEL_NAME,
+            split="val",
+            use_cache=True
+        )
         # Training arguments
         if progress:
             save_total_limit=3,
             fp16=torch.cuda.is_available(),
             dataloader_num_workers=4,
+            dataloader_pin_memory=True,  # Faster CPU→GPU transfers for GPU training
             report_to="none",
             seed=SEED,
             predict_with_generate=True,  # Still used for seq2seq generation during eval
             generation_max_length=200, # Prevent infinite generation
+            disable_tqdm=True,  # Disable progress bars during training
         )
         # Data collator

training/whisper_trainer.py CHANGED Viewed

@@ -3,10 +3,13 @@ Whisper training using HuggingFace transformers.
 Full integration with HuggingFace training features.
 """
 import os
 import json
 import torch
 import numpy as np
 import random
 from typing import Tuple, Optional, Dict, Any, List, Union
 from dataclasses import dataclass
 from datasets import load_dataset, Audio, disable_caching
@@ -17,7 +20,7 @@ from transformers import (
     EarlyStoppingCallback,
 )
 from owsm_model import OWSMWithEntityLoss
-from data.manager import ENTITIES_PATH, MODEL_OUTPUT_DIR, BASE_DIR
 # Disable dataset caching to save disk space
 disable_caching()
@@ -93,12 +96,13 @@ class DataCollatorSpeechSeq2SeqWithPadding:
         batch = {"input_features": input_features_batch}
         # Pad labels (text tokens) using the processor's tokenizer
-        # Pass as list of dicts with "input_ids" key
-        label_features_dicts = [{"input_ids": label.tolist() if isinstance(label, np.ndarray) else label} for label in labels_list]
-        labels_batch = self.processor.tokenizer.pad(
-            label_features_dicts,
             return_tensors="pt",
-            padding=True
         )
         # Replace padding token id's of the labels by -100 so they are ignored by the loss function
@@ -161,10 +165,39 @@ def compute_wer_metric(predictions, labels, tokenizer):
         return {"wer": np.mean(wer_scores)}
-def prepare_whisper_dataset(dataset, processor):
     """
     Prepare dataset for Whisper training using Hugging Face Datasets.
     """
     def prepare_batch(batch):
         """Process a batch of examples."""
@@ -231,11 +264,22 @@ def prepare_whisper_dataset(dataset, processor):
         batched=True,
         batch_size=16,
         remove_columns=column_names,
-        desc="Preprocessing dataset",
         load_from_cache_file=False,  # Don't load from cache
         keep_in_memory=True,  # Keep in memory to avoid disk writes
     )
     return dataset
@@ -299,6 +343,17 @@ def run_whisper_training_progress(epochs: int, batch_size: int, learning_rate: f
         processor = WhisperProcessor.from_pretrained(WHISPER_MODEL_NAME)
         print(f"✓ Whisper processor loaded successfully")
         # Load Whisper model
         if progress:
             progress(0.25, desc=f"Loading Whisper model: {WHISPER_MODEL_NAME}...")
@@ -314,6 +369,10 @@ def run_whisper_training_progress(epochs: int, batch_size: int, learning_rate: f
             attn_implementation="eager",
         )
         print(f"✓ Whisper model loaded successfully")
         device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -324,12 +383,26 @@ def run_whisper_training_progress(epochs: int, batch_size: int, learning_rate: f
         if progress:
             progress(0.3, desc="Preprocessing training dataset...")
         print("\nPreprocessing training dataset...")
-        train_dataset = prepare_whisper_dataset(train_dataset_raw, processor)
         if progress:
             progress(0.4, desc="Preprocessing validation dataset...")
         print("Preprocessing validation dataset...")
-        val_dataset = prepare_whisper_dataset(val_dataset_raw, processor)
         # Training arguments
         if progress:
@@ -355,10 +428,12 @@ def run_whisper_training_progress(epochs: int, batch_size: int, learning_rate: f
             save_total_limit=3,
             fp16=torch.cuda.is_available(),
             dataloader_num_workers=4,
             report_to="none",
             seed=SEED,
             predict_with_generate=True,  # Still used for seq2seq generation during eval
             generation_max_length=200,
         )
         # Data collator

 Full integration with HuggingFace training features.
 """
 import os
+# Disable tokenizers parallelism to avoid fork warning with DataLoader workers
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 import json
 import torch
 import numpy as np
 import random
+import hashlib
 from typing import Tuple, Optional, Dict, Any, List, Union
 from dataclasses import dataclass
 from datasets import load_dataset, Audio, disable_caching
     EarlyStoppingCallback,
 )
 from owsm_model import OWSMWithEntityLoss
+from data.manager import ENTITIES_PATH, MODEL_OUTPUT_DIR, BASE_DIR, CACHE_DIR
 # Disable dataset caching to save disk space
 disable_caching()
         batch = {"input_features": input_features_batch}
         # Pad labels (text tokens) using the processor's tokenizer
+        # Use tokenizer.__call__() for better performance with fast tokenizers
+        label_ids_list = [label.tolist() if isinstance(label, np.ndarray) else label for label in labels_list]
+        labels_batch = self.processor.tokenizer(
+            label_ids_list,
             return_tensors="pt",
+            padding=True,
+            truncation=False  # Already handled in preprocessing
         )
         # Replace padding token id's of the labels by -100 so they are ignored by the loss function
         return {"wer": np.mean(wer_scores)}
+def get_cache_key(dataset_name: str, model_name: str, split: str, seed: int) -> str:
+    """Generate a cache key based on dataset, model, split, and seed."""
+    cache_string = f"{dataset_name}_{model_name}_{split}_{seed}"
+    return hashlib.md5(cache_string.encode()).hexdigest()
+def prepare_whisper_dataset(dataset, processor, dataset_name: str = None, model_name: str = None, split: str = None, use_cache: bool = True):
     """
     Prepare dataset for Whisper training using Hugging Face Datasets.
+    Supports caching to avoid reprocessing.
+    Args:
+        dataset: The dataset to process
+        processor: The processor to use for preprocessing
+        dataset_name: Name of the dataset (for cache key)
+        model_name: Name of the model (for cache key)
+        split: Split name ('train' or 'val') (for cache key)
+        use_cache: Whether to use cache if available
     """
+    # Try to load from cache if enabled and cache key components provided
+    if use_cache and dataset_name and model_name and split:
+        cache_key = get_cache_key(dataset_name, model_name, split, SEED)
+        cache_path = os.path.join(CACHE_DIR, cache_key)
+        if os.path.exists(cache_path):
+            print(f"Loading preprocessed {split} dataset from cache: {cache_path}")
+            try:
+                from datasets import load_from_disk
+                cached_dataset = load_from_disk(cache_path)
+                print(f"✓ Successfully loaded cached {split} dataset ({len(cached_dataset):,} samples)")
+                return cached_dataset
+            except Exception as e:
+                print(f"⚠ Failed to load from cache: {e}. Reprocessing...")
     def prepare_batch(batch):
         """Process a batch of examples."""
         batched=True,
         batch_size=16,
         remove_columns=column_names,
+        desc=None,  # Disable progress bar for dataset preprocessing
         load_from_cache_file=False,  # Don't load from cache
         keep_in_memory=True,  # Keep in memory to avoid disk writes
     )
+    # Save to cache if enabled and cache key components provided
+    if use_cache and dataset_name and model_name and split:
+        cache_key = get_cache_key(dataset_name, model_name, split, SEED)
+        cache_path = os.path.join(CACHE_DIR, cache_key)
+        print(f"Saving preprocessed {split} dataset to cache: {cache_path}")
+        try:
+            dataset.save_to_disk(cache_path)
+            print(f"✓ Successfully cached {split} dataset ({len(dataset):,} samples)")
+        except Exception as e:
+            print(f"⚠ Failed to save to cache: {e}. Continuing without cache...")
     return dataset
         processor = WhisperProcessor.from_pretrained(WHISPER_MODEL_NAME)
         print(f"✓ Whisper processor loaded successfully")
+        # Fix pad_token issue: Whisper tokenizers often have pad_token_id == eos_token_id
+        # This causes warnings about attention masks. Set pad_token to unk_token if needed.
+        if processor.tokenizer.pad_token_id == processor.tokenizer.eos_token_id:
+            if processor.tokenizer.unk_token_id is not None:
+                processor.tokenizer.pad_token_id = processor.tokenizer.unk_token_id
+                processor.tokenizer.pad_token = processor.tokenizer.unk_token
+                print(f"✓ Set pad_token to unk_token ({processor.tokenizer.unk_token_id}) to avoid attention mask warnings")
+            else:
+                # If no unk_token, use eos_token but ensure attention masks are always passed
+                print(f"⚠ pad_token == eos_token ({processor.tokenizer.eos_token_id}). Ensure attention masks are passed during generation.")
         # Load Whisper model
         if progress:
             progress(0.25, desc=f"Loading Whisper model: {WHISPER_MODEL_NAME}...")
             attn_implementation="eager",
         )
+        # Update model config to match tokenizer pad_token_id
+        if hasattr(model.config, 'pad_token_id'):
+            model.config.pad_token_id = processor.tokenizer.pad_token_id
         print(f"✓ Whisper model loaded successfully")
         device = "cuda" if torch.cuda.is_available() else "cpu"
         if progress:
             progress(0.3, desc="Preprocessing training dataset...")
         print("\nPreprocessing training dataset...")
+        train_dataset = prepare_whisper_dataset(
+            train_dataset_raw,
+            processor,
+            dataset_name=HF_DATASET_NAME,
+            model_name=WHISPER_MODEL_NAME,
+            split="train",
+            use_cache=True
+        )
         if progress:
             progress(0.4, desc="Preprocessing validation dataset...")
         print("Preprocessing validation dataset...")
+        val_dataset = prepare_whisper_dataset(
+            val_dataset_raw,
+            processor,
+            dataset_name=HF_DATASET_NAME,
+            model_name=WHISPER_MODEL_NAME,
+            split="val",
+            use_cache=True
+        )
         # Training arguments
         if progress:
             save_total_limit=3,
             fp16=torch.cuda.is_available(),
             dataloader_num_workers=4,
+            dataloader_pin_memory=True,  # Faster CPU→GPU transfers for GPU training
             report_to="none",
             seed=SEED,
             predict_with_generate=True,  # Still used for seq2seq generation during eval
             generation_max_length=200,
+            disable_tqdm=True,  # Disable progress bars during training
         )
         # Data collator