caribbean-voices-hackathon / scripts /upload_csv_only.py
shaun3141's picture
Refactoring gradio app
2941e2c
#!/usr/bin/env python3
"""
Simplified upload: Upload only CSV files to HF Dataset.
Audio files will be handled via Space storage (upload Audio.zip once).
"""
import os
import sys
import pandas as pd
from huggingface_hub import HfApi
from datasets import Dataset, DatasetDict
# --- Hugging Face target ---
# Repository id the dataset is pushed to, and the value passed as ``private``
# when pushing.
DATASET_NAME = "shaun3141/caribbean-voices-hackathon"
PRIVATE = True

# --- Local input files ---
# BASE_DIR is the parent of the directory that contains this script; both CSV
# files are expected directly inside it.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
BASE_DIR = os.path.dirname(_SCRIPT_DIR)
TRAIN_CSV, TEST_CSV = (
    os.path.join(BASE_DIR, csv_name) for csv_name in ("Train.csv", "Test.csv")
)
def upload_csv_dataset():
    """Upload Train.csv and Test.csv to the Hugging Face Hub as one dataset.

    Reads the CSV files at ``TRAIN_CSV`` and ``TEST_CSV``, wraps them as the
    ``train`` and ``test`` splits of a ``DatasetDict`` and pushes that to
    ``DATASET_NAME``. Only the tabular data is uploaded; no audio is touched.
    Progress and errors are reported on stdout.

    Returns:
        bool: ``True`` if the push succeeded. ``False`` if an input file is
        missing or cannot be read/parsed, the ``whoami`` check fails, or the
        push itself fails.
    """
    banner = "=" * 70
    print(banner)
    print("UPLOADING CSV FILES TO HUGGING FACE DATASET")
    print(banner)

    # Check files
    if not os.path.exists(TRAIN_CSV):
        print(f"❌ Train.csv not found: {TRAIN_CSV}")
        return False
    if not os.path.exists(TEST_CSV):
        print(f"❌ Test.csv not found: {TEST_CSV}")
        return False

    # Load CSVs
    print("\n1. Loading CSV files...")
    try:
        train_df = pd.read_csv(TRAIN_CSV)
        test_df = pd.read_csv(TEST_CSV)
    except (OSError, ValueError) as e:
        # pandas' EmptyDataError / ParserError and UnicodeDecodeError are all
        # ValueError subclasses; OSError covers permission and I/O problems.
        # Report them like every other failure instead of crashing.
        print(f" ❌ Could not read CSV files: {e}")
        return False
    print(f" Train: {len(train_df):,} samples")
    print(f" Test: {len(test_df):,} samples")

    # Add empty Transcription column to test to match train features
    if 'Transcription' not in test_df.columns:
        test_df['Transcription'] = ""

    # Create datasets (CSV only, no audio)
    print("\n2. Creating dataset structure...")
    dataset_dict = DatasetDict({
        "train": Dataset.from_pandas(train_df),
        "test": Dataset.from_pandas(test_df),
    })

    # Check login before attempting the upload so the failure message is
    # specific rather than a generic upload error.
    print("\n3. Checking authentication...")
    try:
        api = HfApi()
        user = api.whoami()
        print(f" ✓ Logged in as: {user['name']}")
    except Exception as e:
        print(f" ❌ Not logged in: {e}")
        return False

    # Upload
    print("\n4. Uploading dataset (CSV files only)...")
    print(" Note: Audio files should be uploaded separately via Space storage")
    try:
        dataset_dict.push_to_hub(
            DATASET_NAME,
            private=PRIVATE,
        )
    except Exception as e:
        print(f"\n❌ Error uploading: {e}")
        # Imported here because it is only needed on this failure path.
        import traceback
        traceback.print_exc()
        return False

    print("\n✅ CSV files uploaded successfully!")
    print(f" View at: https://huggingface.co/datasets/{DATASET_NAME}")
    return True
if __name__ == "__main__":
    # Script entry point: run the upload and map its boolean result onto the
    # process exit status (1 on failure, default 0 on success).
    if not upload_csv_dataset():
        print("\n❌ Upload failed.")
        sys.exit(1)

    print("\n" + "=" * 70)
    print("✅ UPLOAD COMPLETE")
    print("=" * 70)
    print("\nNext steps:")
    print("1. Upload Audio.zip to Space storage (one-time)")
    print("2. Space will extract audio files automatically")
    print("3. CSV data loads from HF Dataset automatically")