#!/usr/bin/env python3
"""
One-time script to upload Caribbean Voices data to a private Hugging Face Dataset.
Run this once to push all data to HF, then the Space will load from there automatically.
"""
import os
import sys
import pandas as pd
import zipfile
from huggingface_hub import HfApi
from datasets import Dataset, DatasetDict, Audio
# Configuration
DATASET_NAME = "shaun3141/caribbean-voices-hackathon" # Your username
PRIVATE = True # Make dataset private
# Paths - adjust based on your local setup
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
TRAIN_CSV = os.path.join(BASE_DIR, "Train.csv")
TEST_CSV = os.path.join(BASE_DIR, "Test.csv")
AUDIO_ZIP = os.path.join(BASE_DIR, "Audio.zip")
AUDIO_DIR = os.path.join(BASE_DIR, "audio_files")
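# BASE_DIR resolves to the parent of the directory containing this script, so
# Train.csv, Test.csv, Audio.zip, and audio_files/ are all expected one level
# above the script.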
def check_files():
"""Check if required files exist"""
missing = []
if not os.path.exists(TRAIN_CSV):
missing.append(TRAIN_CSV)
if not os.path.exists(TEST_CSV):
missing.append(TEST_CSV)
if not os.path.exists(AUDIO_DIR) and not os.path.exists(AUDIO_ZIP):
missing.append("Audio directory or Audio.zip")
if missing:
print("❌ Missing files:")
for f in missing:
print(f" - {f}")
return False
return True
def prepare_audio_dataset():
    """Prepare audio files for the dataset, extracting Audio.zip if needed."""
    print("Preparing audio files...")
    # Extract the archive if the audio directory doesn't exist yet
    # (assumes the .wav files sit at the root of Audio.zip)
    if not os.path.exists(AUDIO_DIR) and os.path.exists(AUDIO_ZIP):
        print(f"Extracting {AUDIO_ZIP} to {AUDIO_DIR}...")
        with zipfile.ZipFile(AUDIO_ZIP, "r") as zf:
            zf.extractall(AUDIO_DIR)
    if os.path.exists(AUDIO_DIR):
        audio_files = [f for f in os.listdir(AUDIO_DIR) if f.endswith(".wav")]
        print(f"Found {len(audio_files)} audio files in {AUDIO_DIR}")
        return AUDIO_DIR, audio_files
    raise FileNotFoundError("No audio files found")
def create_dataset():
"""Create Hugging Face Dataset from local files"""
print("=" * 70)
print("CREATING HUGGING FACE DATASET")
print("=" * 70)
# Load CSV files
print("\n1. Loading CSV files...")
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)
print(f" Train: {len(train_df):,} samples")
print(f" Test: {len(test_df):,} samples")
# Prepare audio paths
print("\n2. Preparing audio files...")
audio_dir, audio_files = prepare_audio_dataset()
# Create datasets
print("\n3. Creating dataset structure...")
    # For train dataset - include audio paths (missing files simply get None)
    train_data = []
    for _, row in train_df.iterrows():
        audio_id = row["ID"]
        audio_path = os.path.join(audio_dir, f"{audio_id}.wav")
        train_data.append({
            "id": audio_id,
            "transcription": str(row["Transcription"]),
            "audio": audio_path if os.path.exists(audio_path) else None,
        })
    # For test dataset - add empty transcription to match train features
    test_data = []
    for _, row in test_df.iterrows():
        audio_id = row["ID"]
        audio_path = os.path.join(audio_dir, f"{audio_id}.wav")
        test_data.append({
            "id": audio_id,
            "transcription": "",  # Empty for test set
            "audio": audio_path if os.path.exists(audio_path) else None,
        })
print(f" Train entries: {len(train_data):,}")
print(f" Test entries: {len(test_data):,}")
# Create Dataset objects
print("\n4. Creating Dataset objects...")
train_dataset = Dataset.from_list(train_data)
test_dataset = Dataset.from_list(test_data)
# Cast audio column to Audio feature
train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16000))
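    # The Audio feature keeps the file path and decodes/resamples to 16 kHz
    # lazily on access; rows with audio=None stay None. In recent `datasets`
    # versions, push_to_hub embeds the referenced WAV bytes into the parquet
    # shards, so the Space does not need the local audio files afterwards.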
# Create DatasetDict
dataset_dict = DatasetDict({
"train": train_dataset,
"test": test_dataset
})
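    # Resulting splits (both share the same features so they fit one repo):
    #   train: {"id": string, "transcription": string, "audio": Audio(16 kHz)}
    #   test:  {"id": string, "transcription": "" (held out), "audio": Audio(16 kHz)}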
return dataset_dict
def upload_dataset(dataset_dict):
"""Upload dataset to Hugging Face"""
print("\n" + "=" * 70)
print("UPLOADING TO HUGGING FACE")
print("=" * 70)
print(f"\nDataset name: {DATASET_NAME}")
print(f"Private: {PRIVATE}")
# Check if logged in
try:
api = HfApi()
user = api.whoami()
print(f"\n✓ Logged in as: {user['name']}")
    except Exception:
print("\n❌ Not logged in. Please run: huggingface-cli login")
print(" Or use: from huggingface_hub import login; login()")
return False
# Upload dataset
print("\n5. Uploading dataset (this may take a while)...")
try:
dataset_dict.push_to_hub(
DATASET_NAME,
private=PRIVATE,
max_shard_size="5GB" # Split large datasets
)
print(f"\n✅ Dataset uploaded successfully!")
print(f" View at: https://huggingface.co/datasets/{DATASET_NAME}")
return True
except Exception as e:
print(f"\n❌ Error uploading: {e}")
import traceback
traceback.print_exc()
return False
def main():
"""Main upload pipeline"""
print("=" * 70)
print("CARIBBEAN VOICES DATASET UPLOADER")
print("=" * 70)
# Check files
if not check_files():
sys.exit(1)
# Create dataset
try:
dataset_dict = create_dataset()
except Exception as e:
print(f"\n❌ Error creating dataset: {e}")
import traceback
traceback.print_exc()
sys.exit(1)
# Upload
success = upload_dataset(dataset_dict)
if success:
print("\n" + "=" * 70)
print("✅ UPLOAD COMPLETE")
print("=" * 70)
print(f"\nDataset available at: https://huggingface.co/datasets/{DATASET_NAME}")
print("\nNext steps:")
print("1. Update app.py with dataset name")
print("2. Deploy Space - it will auto-load from dataset")
else:
print("\n❌ Upload failed. Please check errors above.")
sys.exit(1)
if __name__ == "__main__":
main()