shaun3141 committed on
Commit
0ac13f6
·
1 Parent(s): ef24863

Fix training pipeline: use composition for model wrapper and HF Datasets for audio loading

Browse files
Files changed (2) hide show
  1. owsm_model.py +103 -39
  2. training/trainer.py +71 -208
owsm_model.py CHANGED
@@ -4,31 +4,30 @@ This implements loss re-weighting for proper nouns without external data.
4
  """
5
  import torch
6
  import torch.nn as nn
7
- from transformers import AutoModelForSpeechSeq2Seq
8
- from typing import Set, Optional, Dict
 
9
 
10
-
11
- class OWSMWithEntityLoss(AutoModelForSpeechSeq2Seq):
12
  """
13
- OWSM model with weighted cross-entropy loss that up-weights errors on entity tokens.
14
-
15
- This is fully compliant with competition rules:
16
- - No external data (entities come from training transcripts only)
17
- - Single model (no reranker, no second model)
18
- - Reproducible (deterministic loss computation)
19
 
20
- FIXED: Now weights ALL tokens that make up an entity, not just the first token.
 
21
  """
22
 
23
- def __init__(self, config, tokenizer, high_value_tokens: Set[str], entity_weight: float = 3.0):
24
  """
25
  Args:
26
  config: Model configuration
 
27
  tokenizer: Tokenizer for converting entity words to token IDs
28
  high_value_tokens: Set of entity words (lowercase) to up-weight
29
  entity_weight: Multiplier for entity token errors (default: 3.0)
30
  """
31
  super().__init__(config)
 
32
  self.tokenizer = tokenizer
33
  self.entity_weight = entity_weight
34
 
@@ -47,11 +46,13 @@ class OWSMWithEntityLoss(AutoModelForSpeechSeq2Seq):
47
  all_entity_token_ids.update(token_id_set)
48
 
49
  print(f" → Mapped to {len(all_entity_token_ids)} unique token IDs")
50
- print(f" → Average tokens per entity: {sum(len(ids) for ids in self.entity_word_to_token_ids.values()) / max(len(self.entity_word_to_token_ids), 1):.2f}")
 
 
51
 
52
  # Pre-compute vocab_weights tensor for O(1) lookup during training
53
  vocab_size = config.vocab_size if hasattr(config, 'vocab_size') else len(tokenizer)
54
- self.vocab_weights = torch.ones(vocab_size, dtype=torch.float32)
55
 
56
  # Set entity token weights
57
  for token_id in all_entity_token_ids:
@@ -61,59 +62,122 @@ class OWSMWithEntityLoss(AutoModelForSpeechSeq2Seq):
61
  # Store for debugging
62
  self.entity_token_ids = all_entity_token_ids
63
  self.high_value_tokens = high_value_tokens
64
-
65
- def compute_loss(self, model_outputs, labels, attention_mask=None):
66
- """Compute weighted cross-entropy loss with higher weight for entity tokens."""
67
- logits = model_outputs.logits # [B, T, V]
 
 
 
 
68
 
69
- # Shift for teacher forcing
70
- shift_logits = logits[..., :-1, :].contiguous()
71
- shift_labels = labels[..., 1:].contiguous()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  # Flatten
74
- flat_logits = shift_logits.view(-1, shift_logits.size(-1))
75
- flat_labels = shift_labels.view(-1)
76
 
77
- # Compute per-token loss
78
- loss_fct = nn.CrossEntropyLoss(reduction="none")
79
- loss = loss_fct(flat_logits, flat_labels)
80
 
81
- # Get weights using pre-computed vocab_weights tensor (O(1) lookup)
82
- # Move vocab_weights to same device as labels
83
- if self.vocab_weights.device != flat_labels.device:
84
- self.vocab_weights = self.vocab_weights.to(flat_labels.device)
85
 
86
- # Lookup weights for each token in the batch (O(1) operation)
87
- weights = self.vocab_weights[flat_labels]
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
  # Apply weights
90
  weighted_loss = loss * weights
91
 
92
- # Ignore padding tokens
93
- padding_mask = (flat_labels != -100)
94
- weighted_loss = weighted_loss[padding_mask]
 
 
95
 
96
  if weighted_loss.numel() == 0:
97
- return loss[padding_mask].mean() if padding_mask.any() else loss.mean()
98
-
99
- return weighted_loss.mean()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  @classmethod
102
  def from_pretrained(cls, pretrained_model_name_or_path: str,
103
  tokenizer, high_value_tokens: Set[str],
104
  entity_weight: float = 3.0, **kwargs):
105
  """Load pretrained OWSM model and wrap with entity-weighted loss."""
 
106
  base_model = AutoModelForSpeechSeq2Seq.from_pretrained(
107
  pretrained_model_name_or_path, **kwargs
108
  )
109
 
 
110
  model = cls(
111
  config=base_model.config,
 
112
  tokenizer=tokenizer,
113
  high_value_tokens=high_value_tokens,
114
  entity_weight=entity_weight
115
  )
116
 
117
- model.load_state_dict(base_model.state_dict(), strict=True)
118
  return model
119
 
 
 
 
 
 
 
 
 
 
4
  """
5
  import torch
6
  import torch.nn as nn
7
+ from transformers import AutoModelForSpeechSeq2Seq, PreTrainedModel
8
+ from transformers.modeling_outputs import Seq2SeqLMOutput
9
+ from typing import Set, Optional, Dict, Any
10
 
11
+ class OWSMWithEntityLoss(PreTrainedModel):
 
12
  """
13
+ Wrapper around OWSM model that implements weighted cross-entropy loss
14
+ to up-weight errors on entity tokens.
 
 
 
 
15
 
16
+ This model wraps the base model using composition rather than inheritance
17
+ to avoid issues with the AutoModel factory pattern.
18
  """
19
 
20
+ def __init__(self, config, base_model, tokenizer, high_value_tokens: Set[str], entity_weight: float = 3.0):
21
  """
22
  Args:
23
  config: Model configuration
24
+ base_model: The instantiated base model (SpeechEncoderDecoderModel)
25
  tokenizer: Tokenizer for converting entity words to token IDs
26
  high_value_tokens: Set of entity words (lowercase) to up-weight
27
  entity_weight: Multiplier for entity token errors (default: 3.0)
28
  """
29
  super().__init__(config)
30
+ self.model = base_model
31
  self.tokenizer = tokenizer
32
  self.entity_weight = entity_weight
33
 
 
46
  all_entity_token_ids.update(token_id_set)
47
 
48
  print(f" → Mapped to {len(all_entity_token_ids)} unique token IDs")
49
+ if self.entity_word_to_token_ids:
50
+ avg_tokens = sum(len(ids) for ids in self.entity_word_to_token_ids.values()) / len(self.entity_word_to_token_ids)
51
+ print(f" → Average tokens per entity: {avg_tokens:.2f}")
52
 
53
  # Pre-compute vocab_weights tensor for O(1) lookup during training
54
  vocab_size = config.vocab_size if hasattr(config, 'vocab_size') else len(tokenizer)
55
+ self.register_buffer('vocab_weights', torch.ones(vocab_size, dtype=torch.float32))
56
 
57
  # Set entity token weights
58
  for token_id in all_entity_token_ids:
 
62
  # Store for debugging
63
  self.entity_token_ids = all_entity_token_ids
64
  self.high_value_tokens = high_value_tokens
65
+
66
+ def get_encoder(self):
67
+ """Delegate to sub-model's encoder."""
68
+ return self.model.get_encoder()
69
+
70
+ def get_decoder(self):
71
+ """Delegate to sub-model's decoder."""
72
+ return self.model.get_decoder()
73
 
74
+ def forward(self, input_features=None, attention_mask=None, decoder_input_ids=None, labels=None, **kwargs):
75
+ """
76
+ Forward pass that computes weighted loss if labels are provided.
77
+ Delegates to underlying model.
78
+ """
79
+ outputs = self.model(
80
+ input_features=input_features,
81
+ attention_mask=attention_mask,
82
+ decoder_input_ids=decoder_input_ids,
83
+ labels=labels,
84
+ return_dict=True,
85
+ **kwargs
86
+ )
87
+
88
+ # If we are not training or have no labels, return standard outputs
89
+ if labels is None:
90
+ return outputs
91
+
92
+ # Custom Loss Computation
93
+ logits = outputs.logits # [B, T, V]
94
 
95
  # Flatten
96
+ # Standard CrossEntropyLoss expects [N, C] logits and [N] labels
97
+ # where N is batch_size * sequence_length
98
 
99
+ flat_logits = logits.view(-1, logits.size(-1))
100
+ flat_labels = labels.view(-1)
 
101
 
102
+ # Create per-token weights
103
+ # Use pre-computed weights: O(1) lookup
104
+ # labels can be -100 (ignore), we need to handle that for lookup
 
105
 
106
+ # Create a mask for valid labels (not -100)
107
+ valid_mask = (flat_labels != -100)
108
+
109
+ # Use padding token ID (usually 0 or 1) for lookup where label is -100
110
+ # This avoids index out of bounds. We'll mask the loss anyway.
111
+ safe_labels = flat_labels.clone()
112
+ safe_labels[~valid_mask] = 0
113
+
114
+ # Get weights
115
+ weights = self.vocab_weights[safe_labels]
116
+
117
+ # Compute unreduced loss
118
+ loss_fct = nn.CrossEntropyLoss(reduction="none")
119
+ loss = loss_fct(flat_logits, flat_labels)
120
 
121
  # Apply weights
122
  weighted_loss = loss * weights
123
 
124
+ # Apply masking (CrossEntropyLoss usually handles -100 by ignoring,
125
+ # but since we used reduction='none', we have to double check)
126
+ # The loss for -100 labels should be 0 from CrossEntropyLoss if used correctly,
127
+ # but explicit masking is safer with custom weighting.
128
+ weighted_loss = weighted_loss[valid_mask]
129
 
130
  if weighted_loss.numel() == 0:
131
+ final_loss = torch.tensor(0.0, device=logits.device, requires_grad=True)
132
+ else:
133
+ final_loss = weighted_loss.mean()
134
+
135
+ return Seq2SeqLMOutput(
136
+ loss=final_loss,
137
+ logits=logits,
138
+ past_key_values=outputs.past_key_values,
139
+ decoder_hidden_states=outputs.decoder_hidden_states,
140
+ decoder_attentions=outputs.decoder_attentions,
141
+ cross_attentions=outputs.cross_attentions,
142
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
143
+ encoder_hidden_states=outputs.encoder_hidden_states,
144
+ encoder_attentions=outputs.encoder_attentions,
145
+ )
146
+
147
+ def generate(self, *args, **kwargs):
148
+ """Delegate generation to the underlying model."""
149
+ return self.model.generate(*args, **kwargs)
150
+
151
+ def prepare_inputs_for_generation(self, *args, **kwargs):
152
+ """Delegate to underlying model."""
153
+ return self.model.prepare_inputs_for_generation(*args, **kwargs)
154
 
155
  @classmethod
156
  def from_pretrained(cls, pretrained_model_name_or_path: str,
157
  tokenizer, high_value_tokens: Set[str],
158
  entity_weight: float = 3.0, **kwargs):
159
  """Load pretrained OWSM model and wrap with entity-weighted loss."""
160
+ # Load the base model using the Auto class
161
  base_model = AutoModelForSpeechSeq2Seq.from_pretrained(
162
  pretrained_model_name_or_path, **kwargs
163
  )
164
 
165
+ # Initialize wrapper
166
  model = cls(
167
  config=base_model.config,
168
+ base_model=base_model,
169
  tokenizer=tokenizer,
170
  high_value_tokens=high_value_tokens,
171
  entity_weight=entity_weight
172
  )
173
 
 
174
  return model
175
 
176
+ def save_pretrained(self, save_directory, **kwargs):
177
+ """
178
+ Save the underlying model to the directory.
179
+ This ensures that the saved model is a standard OWSM model
180
+ that can be loaded with AutoModelForSpeechSeq2Seq for inference.
181
+ """
182
+ print(f"Saving underlying model to {save_directory}...")
183
+ self.model.save_pretrained(save_directory, **kwargs)
training/trainer.py CHANGED
@@ -4,24 +4,17 @@ import json
4
  import torch
5
  import numpy as np
6
  import random
7
- import pandas as pd
8
- from typing import Tuple, Optional
9
- from pathlib import Path
10
- from datasets import Dataset, Audio
11
  from transformers import (
12
  AutoProcessor,
13
- AutoModelForSpeechSeq2Seq,
14
  Seq2SeqTrainingArguments,
15
  Seq2SeqTrainer,
16
  DataCollatorForSeq2Seq,
17
  EarlyStoppingCallback,
18
  )
19
- from sklearn.model_selection import train_test_split
20
- import torchaudio
21
-
22
- from data.manager import ENTITIES_PATH, AUDIO_DIR, MODEL_OUTPUT_DIR
23
- from data.loader import get_train_dataframe
24
  from owsm_model import OWSMWithEntityLoss
 
25
 
26
  # Set seeds for reproducibility
27
  SEED = 42
@@ -36,7 +29,7 @@ torch.use_deterministic_algorithms(True, warn_only=True)
36
  MODEL_NAME = "espnet/owsm_v3.1_ebf_small"
37
  TARGET_SR = 16000
38
  MAX_AUDIO_LENGTH = 30 # seconds
39
-
40
 
41
  def compute_wer_metric(predictions, labels, tokenizer):
42
  """Compute Word Error Rate metric."""
@@ -51,13 +44,11 @@ def compute_wer_metric(predictions, labels, tokenizer):
51
  return 1.0 if len(hyp_words) > 0 else 0.0
52
 
53
  # Simple Levenshtein-like WER
54
- # For simplicity, use character-level edit distance approximation
55
  ref_str = ' '.join(ref_words)
56
  hyp_str = ' '.join(hyp_words)
57
  if ref_str == hyp_str:
58
  return 0.0
59
 
60
- # Count word-level differences
61
  ref_set = set(ref_words)
62
  hyp_set = set(hyp_words)
63
  common = len(ref_set & hyp_set)
@@ -66,7 +57,6 @@ def compute_wer_metric(predictions, labels, tokenizer):
66
 
67
  # Decode predictions and labels
68
  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
69
- decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
70
 
71
  # Replace -100 with pad token for decoding
72
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
@@ -87,76 +77,22 @@ def compute_wer_metric(predictions, labels, tokenizer):
87
  return {"wer": wer}
88
 
89
 
90
- def load_audio(path: str, target_sr: int = TARGET_SR) -> np.ndarray:
91
  """
92
- Load and resample audio to target sample rate, convert to mono.
93
-
94
- FAILS LOUDLY if file doesn't exist - no silent fallbacks.
95
- """
96
- if not os.path.exists(path):
97
- raise FileNotFoundError(
98
- f"Audio file not found: {path}\n"
99
- f"Expected audio file at: {os.path.abspath(path)}\n"
100
- f"Please ensure all audio files are available before training."
101
- )
102
-
103
- try:
104
- wav, sr = torchaudio.load(path)
105
- except Exception as e:
106
- raise RuntimeError(
107
- f"Failed to load audio file: {path}\n"
108
- f"Error: {str(e)}\n"
109
- f"Please check that the file is a valid audio file."
110
- ) from e
111
-
112
- if sr != target_sr:
113
- wav = torchaudio.functional.resample(wav, sr, target_sr)
114
-
115
- # Convert to mono if stereo
116
- if wav.shape[0] > 1:
117
- wav = wav.mean(0, keepdim=True)
118
-
119
- return wav.squeeze(0).numpy() # Return as numpy array for processor
120
-
121
-
122
- def prepare_dataset(df: pd.DataFrame, audio_dir: str, processor, is_training: bool = True):
123
- """
124
- Prepare dataset for training/inference.
125
-
126
- FAILS LOUDLY if audio files are missing - no silent fallbacks.
127
  """
128
 
129
  def prepare_batch(batch):
130
  """Process a batch of examples."""
131
- audio_paths = batch["audio_path"]
132
- transcriptions = batch["Transcription"]
133
 
134
- # Load and process audio - FAIL LOUDLY if missing
135
- audio_arrays = []
136
- for audio_path in audio_paths:
137
- full_path = os.path.join(audio_dir, audio_path)
138
-
139
- # Validate file exists BEFORE processing
140
- if not os.path.exists(full_path):
141
- raise FileNotFoundError(
142
- f"Audio file not found during dataset preparation: {audio_path}\n"
143
- f"Full path: {os.path.abspath(full_path)}\n"
144
- f"Audio directory: {os.path.abspath(audio_dir)}\n"
145
- f"Please ensure all audio files are available before training."
146
- )
147
-
148
- audio = load_audio(full_path)
149
-
150
- # Truncate if too long
151
- max_samples = TARGET_SR * MAX_AUDIO_LENGTH
152
- if len(audio) > max_samples:
153
- audio = audio[:max_samples]
154
-
155
- audio_arrays.append(audio)
156
 
157
  # Process audio with processor
158
  inputs = processor(
159
- audio_arrays,
160
  sampling_rate=TARGET_SR,
161
  return_tensors="pt",
162
  padding=True,
@@ -178,85 +114,26 @@ def prepare_dataset(df: pd.DataFrame, audio_dir: str, processor, is_training: bo
178
 
179
  return batch
180
 
181
- # Create dataset
182
- dataset_dict = {
183
- "audio_path": df["ID"].apply(lambda x: f"{x}.wav").tolist(),
184
- "Transcription": df["Transcription"].astype(str).tolist(),
185
- }
186
-
187
- # Validate all audio files exist BEFORE creating dataset
188
- if is_training:
189
- print("Validating audio files exist...")
190
- missing_files = []
191
- for audio_path in dataset_dict["audio_path"]:
192
- full_path = os.path.join(audio_dir, audio_path)
193
- if not os.path.exists(full_path):
194
- missing_files.append(full_path)
195
-
196
- if missing_files:
197
- error_msg = (
198
- f"Found {len(missing_files)} missing audio files:\n"
199
- f"First 10 missing files:\n"
200
- )
201
- for f in missing_files[:10]:
202
- error_msg += f" - {f}\n"
203
- if len(missing_files) > 10:
204
- error_msg += f" ... and {len(missing_files) - 10} more\n"
205
- error_msg += f"\nPlease ensure all audio files are available before training."
206
- raise FileNotFoundError(error_msg)
207
-
208
- print(f"✓ All {len(dataset_dict['audio_path'])} audio files validated")
209
-
210
- dataset = Dataset.from_dict(dataset_dict)
211
 
212
  # Process in batches
213
  dataset = dataset.map(
214
  prepare_batch,
215
  batched=True,
216
  batch_size=16,
217
- remove_columns=["audio_path"], # Keep Transcription for reference
 
218
  )
219
 
220
  return dataset
221
 
222
 
223
- def create_stratified_split(df: pd.DataFrame, test_size: float = 0.1):
224
- """
225
- Create stratified train/val split based on:
226
- - Utterance length (short vs long)
227
- - Presence of Caribbean keywords
228
- """
229
- # Create bins for stratification
230
- df['word_count'] = df['Transcription'].str.split().str.len()
231
- df['has_caribbean'] = df['Transcription'].str.lower().str.contains(
232
- 'caribbean|bbc|trinidad|jamaica|guyana|haiti|barbados',
233
- regex=True,
234
- na=False
235
- )
236
-
237
- # Create stratification key
238
- df['length_bin'] = pd.cut(df['word_count'], bins=5, labels=False)
239
- df['stratify_key'] = df['length_bin'].astype(str) + '_' + df['has_caribbean'].astype(str)
240
-
241
- train_df, val_df = train_test_split(
242
- df,
243
- test_size=test_size,
244
- stratify=df['stratify_key'],
245
- random_state=SEED
246
- )
247
-
248
- print(f"Train: {len(train_df):,} samples")
249
- print(f"Val: {len(val_df):,} samples")
250
-
251
- return train_df, val_df
252
-
253
-
254
- def run_training_progress(epochs: int, batch_size: int, learning_rate: float, progress=None) -> Tuple[str, str]:
255
  """
256
- Run OWSM training with progress tracking.
257
-
258
- Uses espnet/owsm_v3.1_ebf_small with NO FALLBACKS.
259
- If model loading fails, raises exception with clear error message.
260
  """
261
  try:
262
  if progress:
@@ -278,72 +155,63 @@ def run_training_progress(epochs: int, batch_size: int, learning_rate: float, pr
278
  print(f"Loaded {len(high_value_entities)} high-value entities")
279
 
280
  if progress:
281
- progress(0.1, desc="Loading training data from dataset...")
282
- try:
283
- train_df = get_train_dataframe()
284
- except ValueError as e:
285
- raise FileNotFoundError(
286
- f"Training data not available: {str(e)}. "
287
- f"Please ensure the dataset is loaded."
288
- )
289
- print(f"Loaded {len(train_df):,} training samples")
290
 
 
 
 
 
 
 
 
 
 
291
  # Create train/val split
292
  if progress:
293
  progress(0.15, desc="Creating train/val split...")
294
- train_df_split, val_df_split = create_stratified_split(train_df, test_size=0.1)
295
 
296
- # Load processor - NO FALLBACKS
 
 
 
 
 
 
 
 
 
 
297
  if progress:
298
  progress(0.2, desc=f"Loading processor: {MODEL_NAME}...")
299
  print(f"\nLoading processor: {MODEL_NAME}")
300
- print("NOTE: Using espnet/owsm_v3.1_ebf_small with NO FALLBACKS")
301
- print("If this fails, check that the model is available on HuggingFace.")
302
 
303
  try:
304
  processor = AutoProcessor.from_pretrained(MODEL_NAME)
305
  except Exception as e:
306
- error_msg = (
307
- f"FAILED to load processor from {MODEL_NAME}\n\n"
308
- f"Error: {str(e)}\n\n"
309
- f"This training pipeline requires espnet/owsm_v3.1_ebf_small.\n"
310
- f"No fallbacks are configured. Please ensure:\n"
311
- f"1. The model is available on HuggingFace\n"
312
- f"2. You have internet access to download the model\n"
313
- f"3. You have sufficient disk space\n"
314
- f"4. The transformers library supports this model\n\n"
315
- f"If the model is not available, you may need to use ESPnet's native training framework."
316
- )
317
- raise RuntimeError(error_msg) from e
318
 
319
  print(f"✓ Processor loaded successfully")
320
 
321
- # Load model - NO FALLBACKS
322
  if progress:
323
  progress(0.25, desc=f"Loading model: {MODEL_NAME}...")
324
  print(f"\nLoading model: {MODEL_NAME}")
325
 
326
  try:
 
327
  model = OWSMWithEntityLoss.from_pretrained(
328
  MODEL_NAME,
329
  tokenizer=processor.tokenizer,
330
  high_value_tokens=high_value_entities,
331
- entity_weight=3.0, # 3x weight for entity errors
332
  )
333
  except Exception as e:
334
- error_msg = (
335
- f"FAILED to load model from {MODEL_NAME}\n\n"
336
- f"Error: {str(e)}\n\n"
337
- f"This training pipeline requires espnet/owsm_v3.1_ebf_small.\n"
338
- f"No fallbacks are configured. Please ensure:\n"
339
- f"1. The model is available on HuggingFace\n"
340
- f"2. You have internet access to download the model\n"
341
- f"3. You have sufficient disk space\n"
342
- f"4. The transformers library supports this model\n"
343
- f"5. AutoModelForSpeechSeq2Seq can load this model\n\n"
344
- f"If the model is not available, you may need to use ESPnet's native training framework."
345
- )
346
- raise RuntimeError(error_msg) from e
347
 
348
  print(f"✓ Model loaded successfully")
349
 
@@ -353,14 +221,14 @@ def run_training_progress(epochs: int, batch_size: int, learning_rate: float, pr
353
 
354
  # Prepare datasets
355
  if progress:
356
- progress(0.3, desc="Preparing training dataset...")
357
- print("\nPreparing training dataset...")
358
- train_dataset = prepare_dataset(train_df_split, AUDIO_DIR, processor, is_training=True)
359
 
360
  if progress:
361
- progress(0.4, desc="Preparing validation dataset...")
362
- print("Preparing validation dataset...")
363
- val_dataset = prepare_dataset(val_df_split, AUDIO_DIR, processor, is_training=False)
364
 
365
  # Training arguments
366
  if progress:
@@ -370,7 +238,7 @@ def run_training_progress(epochs: int, batch_size: int, learning_rate: float, pr
370
  output_dir=MODEL_OUTPUT_DIR,
371
  per_device_train_batch_size=batch_size,
372
  per_device_eval_batch_size=batch_size,
373
- gradient_accumulation_steps=4, # Effective batch size = batch_size * 4
374
  learning_rate=learning_rate,
375
  warmup_steps=500,
376
  num_train_epochs=epochs,
@@ -380,20 +248,23 @@ def run_training_progress(epochs: int, batch_size: int, learning_rate: float, pr
380
  save_steps=1000,
381
  logging_steps=100,
382
  load_best_model_at_end=True,
383
- metric_for_best_model="wer", # Use WER instead of loss
384
- greater_is_better=False, # Lower WER is better
385
  save_total_limit=3,
386
- fp16=torch.cuda.is_available(), # Use mixed precision if GPU available
387
  dataloader_num_workers=4,
388
- report_to="none", # Disable wandb/tensorboard
389
  seed=SEED,
390
- predict_with_generate=True, # Need to generate for WER calculation
 
391
  )
392
 
393
  # Data collator
394
  data_collator = DataCollatorForSeq2Seq(
395
  processor=processor,
396
- model=model,
 
 
397
  padding=True,
398
  )
399
 
@@ -435,6 +306,7 @@ def run_training_progress(epochs: int, batch_size: int, learning_rate: float, pr
435
  progress(0.95, desc="Saving model...")
436
 
437
  print(f"\nSaving model to {MODEL_OUTPUT_DIR}...")
 
438
  model.save_pretrained(MODEL_OUTPUT_DIR)
439
  processor.save_pretrained(MODEL_OUTPUT_DIR)
440
 
@@ -471,21 +343,12 @@ def run_training_progress(epochs: int, batch_size: int, learning_rate: float, pr
471
  The model is now ready for inference!
472
  """
473
 
474
- return success_msg, json.dumps(final_metrics, indent=2)
475
 
476
- except FileNotFoundError as e:
477
- error_msg = f"❌ File Not Found Error:\n\n{str(e)}"
478
- if progress:
479
- progress(1.0, desc="Error!")
480
- return error_msg, ""
481
- except RuntimeError as e:
482
- error_msg = f"❌ Runtime Error:\n\n{str(e)}"
483
- if progress:
484
- progress(1.0, desc="Error!")
485
- return error_msg, ""
486
  except Exception as e:
487
  import traceback
488
- error_msg = f"❌ Unexpected Error: {str(e)}\n\n{traceback.format_exc()}"
 
489
  if progress:
490
  progress(1.0, desc="Error!")
491
- return error_msg, ""
 
4
  import torch
5
  import numpy as np
6
  import random
7
+ from typing import Tuple, Optional, Dict, Any
8
+ from datasets import load_dataset, Audio, DatasetDict
 
 
9
  from transformers import (
10
  AutoProcessor,
 
11
  Seq2SeqTrainingArguments,
12
  Seq2SeqTrainer,
13
  DataCollatorForSeq2Seq,
14
  EarlyStoppingCallback,
15
  )
 
 
 
 
 
16
  from owsm_model import OWSMWithEntityLoss
17
+ from data.manager import ENTITIES_PATH, MODEL_OUTPUT_DIR, BASE_DIR
18
 
19
  # Set seeds for reproducibility
20
  SEED = 42
 
29
  MODEL_NAME = "espnet/owsm_v3.1_ebf_small"
30
  TARGET_SR = 16000
31
  MAX_AUDIO_LENGTH = 30 # seconds
32
+ HF_DATASET_NAME = "shaun3141/caribbean-voices-hackathon"
33
 
34
  def compute_wer_metric(predictions, labels, tokenizer):
35
  """Compute Word Error Rate metric."""
 
44
  return 1.0 if len(hyp_words) > 0 else 0.0
45
 
46
  # Simple Levenshtein-like WER
 
47
  ref_str = ' '.join(ref_words)
48
  hyp_str = ' '.join(hyp_words)
49
  if ref_str == hyp_str:
50
  return 0.0
51
 
 
52
  ref_set = set(ref_words)
53
  hyp_set = set(hyp_words)
54
  common = len(ref_set & hyp_set)
 
57
 
58
  # Decode predictions and labels
59
  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
 
60
 
61
  # Replace -100 with pad token for decoding
62
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
 
77
  return {"wer": wer}
78
 
79
 
80
+ def prepare_dataset_hf(dataset, processor):
81
  """
82
+ Prepare dataset using Hugging Face Datasets built-in audio handling.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  """
84
 
85
  def prepare_batch(batch):
86
  """Process a batch of examples."""
87
+ audio = batch["audio"]
88
+ transcriptions = batch["transcription"] # Note: check lowercase 't' in dataset
89
 
90
+ # Audio is already a dictionary with 'array' and 'sampling_rate'
91
+ # because we cast it to Audio() in the loading step
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  # Process audio with processor
94
  inputs = processor(
95
+ [x["array"] for x in audio],
96
  sampling_rate=TARGET_SR,
97
  return_tensors="pt",
98
  padding=True,
 
114
 
115
  return batch
116
 
117
+ # Remove columns that are not needed
118
+ # Note: We keep 'transcription' maybe? No, remove it to save memory, we have labels.
119
+ # But check what columns exist first.
120
+ column_names = dataset.column_names
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
  # Process in batches
123
  dataset = dataset.map(
124
  prepare_batch,
125
  batched=True,
126
  batch_size=16,
127
+ remove_columns=column_names,
128
+ desc="Preprocessing dataset",
129
  )
130
 
131
  return dataset
132
 
133
 
134
+ def run_training_progress(epochs: int, batch_size: int, learning_rate: float, progress=None) -> Tuple[str, Optional[Dict[str, Any]]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  """
136
+ Run OWSM training with progress tracking using HF Datasets.
 
 
 
137
  """
138
  try:
139
  if progress:
 
155
  print(f"Loaded {len(high_value_entities)} high-value entities")
156
 
157
  if progress:
158
+ progress(0.1, desc="Loading dataset from Hugging Face...")
159
+
160
+ # Load dataset from HF
161
+ hf_token = os.getenv("HF_TOKEN")
162
+ print(f"Loading dataset: {HF_DATASET_NAME}")
163
+ dataset = load_dataset(HF_DATASET_NAME, token=hf_token)
 
 
 
164
 
165
+ if 'train' not in dataset:
166
+ raise ValueError(f"Dataset {HF_DATASET_NAME} does not contain a 'train' split.")
167
+
168
+ train_full = dataset['train']
169
+ print(f"Loaded {len(train_full):,} total training samples")
170
+
171
+ # Cast to Audio to ensure correct sampling rate
172
+ train_full = train_full.cast_column("audio", Audio(sampling_rate=TARGET_SR))
173
+
174
  # Create train/val split
175
  if progress:
176
  progress(0.15, desc="Creating train/val split...")
 
177
 
178
+ # Simple random split since we don't want to download all audio to stratify by length/content
179
+ # unless we want to iterate the whole dataset which might be slow.
180
+ # We'll use a random split for speed and simplicity with the streamed/remote dataset.
181
+ split_dataset = train_full.train_test_split(test_size=0.1, seed=SEED)
182
+ train_dataset_raw = split_dataset['train']
183
+ val_dataset_raw = split_dataset['test']
184
+
185
+ print(f"Train: {len(train_dataset_raw):,} samples")
186
+ print(f"Val: {len(val_dataset_raw):,} samples")
187
+
188
+ # Load processor
189
  if progress:
190
  progress(0.2, desc=f"Loading processor: {MODEL_NAME}...")
191
  print(f"\nLoading processor: {MODEL_NAME}")
 
 
192
 
193
  try:
194
  processor = AutoProcessor.from_pretrained(MODEL_NAME)
195
  except Exception as e:
196
+ raise RuntimeError(f"FAILED to load processor from {MODEL_NAME}: {e}")
 
 
 
 
 
 
 
 
 
 
 
197
 
198
  print(f"✓ Processor loaded successfully")
199
 
200
+ # Load model
201
  if progress:
202
  progress(0.25, desc=f"Loading model: {MODEL_NAME}...")
203
  print(f"\nLoading model: {MODEL_NAME}")
204
 
205
  try:
206
+ # Use our new wrapper class
207
  model = OWSMWithEntityLoss.from_pretrained(
208
  MODEL_NAME,
209
  tokenizer=processor.tokenizer,
210
  high_value_tokens=high_value_entities,
211
+ entity_weight=3.0,
212
  )
213
  except Exception as e:
214
+ raise RuntimeError(f"FAILED to load model from {MODEL_NAME}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
215
 
216
  print(f"✓ Model loaded successfully")
217
 
 
221
 
222
  # Prepare datasets
223
  if progress:
224
+ progress(0.3, desc="Preprocessing training dataset...")
225
+ print("\nPreprocessing training dataset...")
226
+ train_dataset = prepare_dataset_hf(train_dataset_raw, processor)
227
 
228
  if progress:
229
+ progress(0.4, desc="Preprocessing validation dataset...")
230
+ print("Preprocessing validation dataset...")
231
+ val_dataset = prepare_dataset_hf(val_dataset_raw, processor)
232
 
233
  # Training arguments
234
  if progress:
 
238
  output_dir=MODEL_OUTPUT_DIR,
239
  per_device_train_batch_size=batch_size,
240
  per_device_eval_batch_size=batch_size,
241
+ gradient_accumulation_steps=4,
242
  learning_rate=learning_rate,
243
  warmup_steps=500,
244
  num_train_epochs=epochs,
 
248
  save_steps=1000,
249
  logging_steps=100,
250
  load_best_model_at_end=True,
251
+ metric_for_best_model="wer",
252
+ greater_is_better=False,
253
  save_total_limit=3,
254
+ fp16=torch.cuda.is_available(),
255
  dataloader_num_workers=4,
256
+ report_to="none",
257
  seed=SEED,
258
+ predict_with_generate=True,
259
+ generation_max_length=200, # Prevent infinite generation
260
  )
261
 
262
  # Data collator
263
  data_collator = DataCollatorForSeq2Seq(
264
  processor=processor,
265
+ model=model, # The trainer needs the model for the collator sometimes if it uses it for padding?
266
+ # Actually DataCollatorForSeq2Seq uses tokenizer usually.
267
+ # But passing model is fine.
268
  padding=True,
269
  )
270
 
 
306
  progress(0.95, desc="Saving model...")
307
 
308
  print(f"\nSaving model to {MODEL_OUTPUT_DIR}...")
309
+ # This calls our custom save_pretrained which saves the inner model
310
  model.save_pretrained(MODEL_OUTPUT_DIR)
311
  processor.save_pretrained(MODEL_OUTPUT_DIR)
312
 
 
343
  The model is now ready for inference!
344
  """
345
 
346
+ return success_msg, final_metrics
347
 
 
 
 
 
 
 
 
 
 
 
348
  except Exception as e:
349
  import traceback
350
+ error_msg = f"❌ Error during training: {str(e)}\n\n{traceback.format_exc()}"
351
+ print(error_msg)
352
  if progress:
353
  progress(1.0, desc="Error!")
354
+ return error_msg, None