#!/usr/bin/env python3
"""
One-time script to upload Caribbean Voices data to a private Hugging Face Dataset.
Run this once to push all data to HF, then the Space will load from there automatically.
"""

import os
import sys
import zipfile

import pandas as pd
from huggingface_hub import HfApi
from datasets import Dataset, DatasetDict, Audio

# Configuration
DATASET_NAME = "shaun3141/caribbean-voices-hackathon"  # Your username
PRIVATE = True  # Make dataset private

# Paths - adjust based on your local setup
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
TRAIN_CSV = os.path.join(BASE_DIR, "Train.csv")
TEST_CSV = os.path.join(BASE_DIR, "Test.csv")
AUDIO_ZIP = os.path.join(BASE_DIR, "Audio.zip")
AUDIO_DIR = os.path.join(BASE_DIR, "audio_files")


def check_files():
    """Check that the required input files exist."""
    missing = []
    if not os.path.exists(TRAIN_CSV):
        missing.append(TRAIN_CSV)
    if not os.path.exists(TEST_CSV):
        missing.append(TEST_CSV)
    if not os.path.exists(AUDIO_DIR) and not os.path.exists(AUDIO_ZIP):
        missing.append("Audio directory or Audio.zip")

    if missing:
        print("❌ Missing files:")
        for f in missing:
            print(f"   - {f}")
        return False
    return True


def prepare_audio_dataset():
    """Locate the audio files, extracting Audio.zip first if needed."""
    print("Preparing audio files...")

    # Use the already-extracted directory if it exists
    if os.path.exists(AUDIO_DIR):
        audio_files = [f for f in os.listdir(AUDIO_DIR) if f.endswith(".wav")]
        print(f"Found {len(audio_files)} audio files in {AUDIO_DIR}")
        return AUDIO_DIR, audio_files

    # Otherwise extract the zip so the per-file paths below resolve.
    # Assumes the .wav files sit at the root of the archive; adjust if the
    # zip contains a subfolder.
    if os.path.exists(AUDIO_ZIP):
        print(f"Found Audio.zip, extracting to {AUDIO_DIR}...")
        with zipfile.ZipFile(AUDIO_ZIP) as zf:
            zf.extractall(AUDIO_DIR)
        audio_files = [f for f in os.listdir(AUDIO_DIR) if f.endswith(".wav")]
        print(f"Extracted {len(audio_files)} audio files")
        return AUDIO_DIR, audio_files

    raise FileNotFoundError("No audio files found")


def create_dataset():
    """Create a Hugging Face DatasetDict from the local files."""
    print("=" * 70)
    print("CREATING HUGGING FACE DATASET")
    print("=" * 70)

    # Load CSV files
    print("\n1. Loading CSV files...")
    train_df = pd.read_csv(TRAIN_CSV)
    test_df = pd.read_csv(TEST_CSV)
    print(f"   Train: {len(train_df):,} samples")
    print(f"   Test: {len(test_df):,} samples")

    # Prepare audio paths
    print("\n2. Preparing audio files...")
    audio_dir, _audio_files = prepare_audio_dataset()

    # Create dataset entries
    print("\n3. Creating dataset structure...")

    # Train split - include audio paths and transcriptions
    train_data = []
    for _, row in train_df.iterrows():
        audio_id = row["ID"]
        audio_path = os.path.join(audio_dir, f"{audio_id}.wav")
        train_data.append({
            "id": audio_id,
            "transcription": str(row["Transcription"]),
            # If the audio file is missing, still include the row with None
            "audio": audio_path if os.path.exists(audio_path) else None,
        })

    # Test split - use an empty transcription so features match the train split
    test_data = []
    for _, row in test_df.iterrows():
        audio_id = row["ID"]
        audio_path = os.path.join(audio_dir, f"{audio_id}.wav")
        test_data.append({
            "id": audio_id,
            "transcription": "",  # Empty for test set
            "audio": audio_path if os.path.exists(audio_path) else None,
        })

    print(f"   Train entries: {len(train_data):,}")
    print(f"   Test entries: {len(test_data):,}")

    # Create Dataset objects
    print("\n4. Creating Dataset objects...")
    train_dataset = Dataset.from_list(train_data)
    test_dataset = Dataset.from_list(test_data)

    # Cast the audio column to the Audio feature (16 kHz)
    train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
    test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16000))

    # Create DatasetDict
    return DatasetDict({
        "train": train_dataset,
        "test": test_dataset,
    })


def upload_dataset(dataset_dict):
    """Upload the dataset to the Hugging Face Hub."""
    print("\n" + "=" * 70)
    print("UPLOADING TO HUGGING FACE")
    print("=" * 70)
    print(f"\nDataset name: {DATASET_NAME}")
    print(f"Private: {PRIVATE}")

    # Check that the user is logged in
    try:
        api = HfApi()
        user = api.whoami()
        print(f"\n✓ Logged in as: {user['name']}")
    except Exception:
        print("\n❌ Not logged in. Please run: huggingface-cli login")
        print("   Or use: from huggingface_hub import login; login()")
        return False

    # Upload dataset
    print("\n5. Uploading dataset (this may take a while)...")
    try:
        dataset_dict.push_to_hub(
            DATASET_NAME,
            private=PRIVATE,
            max_shard_size="5GB",  # Split large datasets into multiple shards
        )
        print("\n✅ Dataset uploaded successfully!")
        print(f"   View at: https://huggingface.co/datasets/{DATASET_NAME}")
        return True
    except Exception as e:
        print(f"\n❌ Error uploading: {e}")
        import traceback
        traceback.print_exc()
        return False


def main():
    """Main upload pipeline."""
    print("=" * 70)
    print("CARIBBEAN VOICES DATASET UPLOADER")
    print("=" * 70)

    # Check files
    if not check_files():
        sys.exit(1)

    # Create dataset
    try:
        dataset_dict = create_dataset()
    except Exception as e:
        print(f"\n❌ Error creating dataset: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

    # Upload
    success = upload_dataset(dataset_dict)

    if success:
        print("\n" + "=" * 70)
        print("✅ UPLOAD COMPLETE")
        print("=" * 70)
        print(f"\nDataset available at: https://huggingface.co/datasets/{DATASET_NAME}")
        print("\nNext steps:")
        print("1. Update app.py with dataset name")
        print("2. Deploy Space - it will auto-load from dataset")
    else:
        print("\n❌ Upload failed. Please check errors above.")
        sys.exit(1)


if __name__ == "__main__":
    main()
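

# ---------------------------------------------------------------------------
# For reference: how a Space (e.g. app.py) might load the uploaded dataset.
# This is a minimal sketch, not part of this script. It assumes the Space has
# an HF_TOKEN secret with read access to the private repo, and a recent
# `datasets` release where load_dataset accepts `token=` (older versions use
# `use_auth_token=`).
#
#   import os
#   from datasets import load_dataset
#
#   ds = load_dataset(
#       "shaun3141/caribbean-voices-hackathon",
#       token=os.environ["HF_TOKEN"],
#   )
#   train = ds["train"]  # columns: id, transcription, audio (16 kHz Audio feature)
#   test = ds["test"]    # transcription is empty for the test split
# ---------------------------------------------------------------------------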