#!/usr/bin/env python3
"""
Simplified upload: Upload only CSV files to HF Dataset.

Audio files will be handled via Space storage (upload Audio.zip once).
"""

import os
import sys
import traceback

import pandas as pd
from huggingface_hub import HfApi
from datasets import Dataset, DatasetDict

# Configuration
DATASET_NAME = "shaun3141/caribbean-voices-hackathon"
PRIVATE = True

# Paths: BASE_DIR is the parent of the directory that contains this script,
# and both CSV files are expected to sit directly inside it.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
TRAIN_CSV = os.path.join(BASE_DIR, "Train.csv")
TEST_CSV = os.path.join(BASE_DIR, "Test.csv")


def upload_csv_dataset() -> bool:
    """Upload Train.csv and Test.csv to the HF Dataset repo (without audio).

    The two CSV files are read with pandas, wrapped in a ``DatasetDict``
    with ``train`` and ``test`` splits, and pushed to ``DATASET_NAME``
    using the ``PRIVATE`` visibility flag. Progress is reported on stdout.

    Returns:
        True when the push succeeded. False when a CSV file is missing,
        when the ``whoami`` authentication check raises, or when the push
        raises (the traceback is printed in that last case).
    """
    print("=" * 70)
    print("UPLOADING CSV FILES TO HUGGING FACE DATASET")
    print("=" * 70)

    # Check files: report every missing CSV before giving up, so the user
    # does not have to re-run the script to discover a second missing file.
    missing = [
        (label, path)
        for label, path in (("Train.csv", TRAIN_CSV), ("Test.csv", TEST_CSV))
        if not os.path.exists(path)
    ]
    for label, path in missing:
        print(f"❌ {label} not found: {path}")
    if missing:
        return False

    # Load CSVs
    print("\n1. Loading CSV files...")
    train_df = pd.read_csv(TRAIN_CSV)
    test_df = pd.read_csv(TEST_CSV)

    print(f" Train: {len(train_df):,} samples")
    print(f" Test: {len(test_df):,} samples")

    # Add empty Transcription column to test to match train features
    if "Transcription" not in test_df.columns:
        test_df["Transcription"] = ""

    # Create datasets (CSV only, no audio)
    print("\n2. Creating dataset structure...")
    dataset_dict = DatasetDict(
        {
            "train": Dataset.from_pandas(train_df),
            "test": Dataset.from_pandas(test_df),
        }
    )

    # Check login before attempting the upload; any failure of whoami()
    # is treated as "not logged in".
    print("\n3. Checking authentication...")
    try:
        user = HfApi().whoami()
        print(f" ✓ Logged in as: {user['name']}")
    except Exception as e:
        print(f" ❌ Not logged in: {e}")
        return False

    # Upload
    print("\n4. Uploading dataset (CSV files only)...")
    print(" Note: Audio files should be uploaded separately via Space storage")

    try:
        dataset_dict.push_to_hub(DATASET_NAME, private=PRIVATE)
    except Exception as e:
        # Script-level boundary: show the full traceback and signal failure
        # through the return value instead of propagating.
        print(f"\n❌ Error uploading: {e}")
        traceback.print_exc()
        return False

    print("\n✅ CSV files uploaded successfully!")
    print(f" View at: https://huggingface.co/datasets/{DATASET_NAME}")
    return True


def main() -> None:
    """Run the upload, print follow-up instructions, exit 1 on failure."""
    if not upload_csv_dataset():
        print("\n❌ Upload failed.")
        sys.exit(1)

    print("\n" + "=" * 70)
    print("✅ UPLOAD COMPLETE")
    print("=" * 70)
    print("\nNext steps:")
    print("1. Upload Audio.zip to Space storage (one-time)")
    print("2. Space will extract audio files automatically")
    print("3. CSV data loads from HF Dataset automatically")


if __name__ == "__main__":
    main()