"""
One-time script to upload Caribbean Voices data to a private Hugging Face Dataset.

Run this once to push all data to HF; the Space will then load from there automatically.
"""

import os
import sys
import zipfile

import pandas as pd
from datasets import Audio, Dataset, DatasetDict
from huggingface_hub import HfApi

# Target dataset on the Hub
DATASET_NAME = "shaun3141/caribbean-voices-hackathon"
PRIVATE = True

# Local file layout (this script is expected to live one level below the project root)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
TRAIN_CSV = os.path.join(BASE_DIR, "Train.csv")
TEST_CSV = os.path.join(BASE_DIR, "Test.csv")
AUDIO_ZIP = os.path.join(BASE_DIR, "Audio.zip")
AUDIO_DIR = os.path.join(BASE_DIR, "audio_files")


def check_files():
    """Check that the required input files exist."""
    missing = []
    if not os.path.exists(TRAIN_CSV):
        missing.append(TRAIN_CSV)
    if not os.path.exists(TEST_CSV):
        missing.append(TEST_CSV)
    if not os.path.exists(AUDIO_DIR) and not os.path.exists(AUDIO_ZIP):
        missing.append("Audio directory or Audio.zip")

    if missing:
        print("❌ Missing files:")
        for f in missing:
            print(f"  - {f}")
        return False
    return True


def prepare_audio_dataset():
    """Locate the audio files, extracting Audio.zip if needed."""
    print("Preparing audio files...")

    if os.path.exists(AUDIO_DIR):
        audio_files = [f for f in os.listdir(AUDIO_DIR) if f.endswith(".wav")]
        print(f"Found {len(audio_files)} audio files in {AUDIO_DIR}")
        return AUDIO_DIR, audio_files
    elif os.path.exists(AUDIO_ZIP):
        # Extract up front so callers can treat the result as a plain directory
        # (assumes the .wav files sit at the root of the archive).
        print(f"Found {AUDIO_ZIP}, extracting to {AUDIO_DIR}...")
        with zipfile.ZipFile(AUDIO_ZIP, "r") as zf:
            zf.extractall(AUDIO_DIR)
        audio_files = [f for f in os.listdir(AUDIO_DIR) if f.endswith(".wav")]
        print(f"Extracted {len(audio_files)} audio files")
        return AUDIO_DIR, audio_files
    else:
        raise FileNotFoundError("No audio files found")
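
# Expected layout after preparation (illustrative):
#   audio_files/
#       <ID>.wav   # one clip per row of Train.csv / Test.csv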


def create_dataset():
    """Create a Hugging Face DatasetDict from the local files."""
    print("=" * 70)
    print("CREATING HUGGING FACE DATASET")
    print("=" * 70)

    print("\n1. Loading CSV files...")
    train_df = pd.read_csv(TRAIN_CSV)
    test_df = pd.read_csv(TEST_CSV)
    print(f"   Train: {len(train_df):,} samples")
    print(f"   Test:  {len(test_df):,} samples")

    print("\n2. Preparing audio files...")
    audio_dir, _ = prepare_audio_dataset()

    print("\n3. Creating dataset structure...")
    # Build one record per CSV row; rows whose audio file is missing keep a
    # None audio entry so IDs stay aligned with the CSVs.
    train_data = []
    for _, row in train_df.iterrows():
        audio_id = row["ID"]
        audio_path = os.path.join(audio_dir, f"{audio_id}.wav")
        train_data.append({
            "id": audio_id,
            "transcription": str(row["Transcription"]),
            "audio": audio_path if os.path.exists(audio_path) else None,
        })

    # Test rows have no transcriptions; store empty strings.
    test_data = []
    for _, row in test_df.iterrows():
        audio_id = row["ID"]
        audio_path = os.path.join(audio_dir, f"{audio_id}.wav")
        test_data.append({
            "id": audio_id,
            "transcription": "",
            "audio": audio_path if os.path.exists(audio_path) else None,
        })
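
    # Each record now looks like (illustrative values):
    #   {"id": "some_id", "transcription": "...", "audio": "/path/to/some_id.wav"}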

    print(f"   Train entries: {len(train_data):,}")
    print(f"   Test entries:  {len(test_data):,}")

    print("\n4. Creating Dataset objects...")
    train_dataset = Dataset.from_list(train_data)
    test_dataset = Dataset.from_list(test_data)

    # Casting to the Audio feature makes `datasets` decode and resample each
    # file to 16 kHz lazily, on access.
    train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
    test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16000))

    dataset_dict = DatasetDict({
        "train": train_dataset,
        "test": test_dataset,
    })

    return dataset_dict
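
# A quick local sanity check before uploading (a sketch; decoding the audio on
# access requires the usual `datasets` audio extras, e.g. soundfile):
#
#   ds = create_dataset()
#   sample = ds["train"][0]
#   print(sample["id"], sample["transcription"][:50])
#   if sample["audio"] is not None:
#       print(sample["audio"]["sampling_rate"], len(sample["audio"]["array"]))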


def upload_dataset(dataset_dict):
    """Upload the dataset to the Hugging Face Hub."""
    print("\n" + "=" * 70)
    print("UPLOADING TO HUGGING FACE")
    print("=" * 70)

    print(f"\nDataset name: {DATASET_NAME}")
    print(f"Private: {PRIVATE}")

    # Verify the user is authenticated before starting a long upload.
    try:
        api = HfApi()
        user = api.whoami()
        print(f"\n✓ Logged in as: {user['name']}")
    except Exception:
        print("\n❌ Not logged in. Please run: huggingface-cli login")
        print("   Or use: from huggingface_hub import login; login()")
        return False
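
    # Note: in non-interactive environments, huggingface_hub can also pick up a
    # token from the HF_TOKEN environment variable, so an explicit login step
    # may not be needed there.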

    print("\n5. Uploading dataset (this may take a while)...")
    try:
        dataset_dict.push_to_hub(
            DATASET_NAME,
            private=PRIVATE,
            max_shard_size="5GB",  # keep individual shards a manageable size
        )
        print("\n✅ Dataset uploaded successfully!")
        print(f"   View at: https://huggingface.co/datasets/{DATASET_NAME}")
        return True
    except Exception as e:
        print(f"\n❌ Error uploading: {e}")
        import traceback
        traceback.print_exc()
        return False


def main():
    """Main upload pipeline."""
    print("=" * 70)
    print("CARIBBEAN VOICES DATASET UPLOADER")
    print("=" * 70)

    if not check_files():
        sys.exit(1)

    try:
        dataset_dict = create_dataset()
    except Exception as e:
        print(f"\n❌ Error creating dataset: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

    success = upload_dataset(dataset_dict)

    if success:
        print("\n" + "=" * 70)
        print("✅ UPLOAD COMPLETE")
        print("=" * 70)
        print(f"\nDataset available at: https://huggingface.co/datasets/{DATASET_NAME}")
        print("\nNext steps:")
        print("1. Update app.py with dataset name")
        print("2. Deploy Space - it will auto-load from dataset")
    else:
        print("\n❌ Upload failed. Please check errors above.")
        sys.exit(1)


if __name__ == "__main__":
    main()
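
# Once uploaded, the Space can load the data directly (a sketch; split names
# match what this script pushes, and read access to the private dataset is
# assumed, e.g. via an HF_TOKEN secret on the Space):
#
#   from datasets import load_dataset
#   ds = load_dataset("shaun3141/caribbean-voices-hackathon")
#   train_ds, test_ds = ds["train"], ds["test"]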