File size: 2,964 Bytes
984c806
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python3
"""
Simplified upload: Upload only CSV files to HF Dataset.
Audio files will be handled via Space storage (upload Audio.zip once).
"""
import os
import sys
import pandas as pd
from huggingface_hub import HfApi
from datasets import Dataset, DatasetDict

# Configuration
# Target repo on the Hugging Face Hub ("namespace/name"); PRIVATE=True keeps
# the uploaded dataset unlisted.
DATASET_NAME = "shaun3141/caribbean-voices-hackathon"
PRIVATE = True

# Paths
# BASE_DIR is the parent of this script's directory — the CSVs are expected
# to sit at the project root, one level above this file.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
TRAIN_CSV = os.path.join(BASE_DIR, "Train.csv")
TEST_CSV = os.path.join(BASE_DIR, "Test.csv")

def upload_csv_dataset():
    """Push Train.csv/Test.csv to the Hub as a DatasetDict (audio excluded).

    Returns True on success, False on any failure (missing files, not
    logged in, or an upload error). Audio is uploaded separately via the
    Space's storage, so only the tabular data goes here.
    """
    banner = "=" * 70
    print(banner)
    print("UPLOADING CSV FILES TO HUGGING FACE DATASET")
    print(banner)

    # Bail out early if either CSV is missing. Checking Train before Test
    # preserves the original short-circuit order.
    for label, path in (("Train.csv", TRAIN_CSV), ("Test.csv", TEST_CSV)):
        if not os.path.exists(path):
            print(f"❌ {label} not found: {path}")
            return False

    print("\n1. Loading CSV files...")
    df_train = pd.read_csv(TRAIN_CSV)
    df_test = pd.read_csv(TEST_CSV)
    print(f"   Train: {len(df_train):,} samples")
    print(f"   Test: {len(df_test):,} samples")

    # The train split has a Transcription column; give test a blank one so
    # both splits share the same feature schema.
    if 'Transcription' not in df_test.columns:
        df_test['Transcription'] = ""

    print("\n2. Creating dataset structure...")
    dataset_dict = DatasetDict({
        "train": Dataset.from_pandas(df_train),
        "test": Dataset.from_pandas(df_test),
    })

    print("\n3. Checking authentication...")
    try:
        # whoami() raises if no valid token is configured locally.
        user = HfApi().whoami()
        print(f"   βœ“ Logged in as: {user['name']}")
    except Exception as e:
        print(f"   ❌ Not logged in: {e}")
        return False

    print("\n4. Uploading dataset (CSV files only)...")
    print("   Note: Audio files should be uploaded separately via Space storage")
    try:
        dataset_dict.push_to_hub(DATASET_NAME, private=PRIVATE)
    except Exception as e:
        print(f"\n❌ Error uploading: {e}")
        import traceback
        traceback.print_exc()
        return False
    else:
        print("\nβœ… CSV files uploaded successfully!")
        print(f"   View at: https://huggingface.co/datasets/{DATASET_NAME}")
        return True

if __name__ == "__main__":
    # Script entry point: run the upload and mirror its success/failure in
    # the process exit code (non-zero on failure).
    if upload_csv_dataset():
        divider = "=" * 70
        print("\n" + divider)
        print("βœ… UPLOAD COMPLETE")
        print(divider)
        print("\nNext steps:")
        print("1. Upload Audio.zip to Space storage (one-time)")
        print("2. Space will extract audio files automatically")
        print("3. CSV data loads from HF Dataset automatically")
    else:
        print("\n❌ Upload failed.")
        sys.exit(1)