|
|
|
|
|
""" |
|
|
Simplified upload: Upload only CSV files to HF Dataset.
Audio files will be handled via Space storage (upload Audio.zip once).
|
|
""" |
|
|
import os |
|
|
import sys |
|
|
import pandas as pd |
|
|
from huggingface_hub import HfApi |
|
|
from datasets import Dataset, DatasetDict |
|
|
|
|
|
|
|
|
# Target dataset repository on the Hugging Face Hub ("<owner>/<name>").
DATASET_NAME = "shaun3141/caribbean-voices-hackathon"
# Passed as `private=` when the dataset is pushed to the Hub.
PRIVATE = True

# Parent of the directory that contains this script; the CSVs are read from there.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
TRAIN_CSV = os.path.join(BASE_DIR, "Train.csv")
TEST_CSV = os.path.join(BASE_DIR, "Test.csv")
|
|
|
|
|
def upload_csv_dataset() -> bool:
    """Upload the train/test CSV files to the Hugging Face dataset (no audio).

    Steps, each reported on stdout:
      1. Check that TRAIN_CSV and TEST_CSV exist and load them with pandas.
      2. Add an empty ``Transcription`` column to the test frame if missing,
         then wrap both frames in a ``DatasetDict`` ("train" / "test").
      3. Verify Hub authentication via ``HfApi().whoami()``.
      4. Push the ``DatasetDict`` to ``DATASET_NAME`` with ``private=PRIVATE``.

    Returns:
        True if the push succeeded. False if either CSV is missing, the
        authentication check raised, or the push raised (the traceback is
        printed in that last case).
    """
    banner = "=" * 70
    print(banner)
    print("UPLOADING CSV FILES TO HUGGING FACE DATASET")
    print(banner)

    # Fail fast, before any loading or network call, if an input is missing.
    for label, path in (("Train.csv", TRAIN_CSV), ("Test.csv", TEST_CSV)):
        if not os.path.exists(path):
            print(f"❌ {label} not found: {path}")
            return False

    print("\n1. Loading CSV files...")
    train_df = pd.read_csv(TRAIN_CSV)
    test_df = pd.read_csv(TEST_CSV)
    print(f" Train: {len(train_df):,} samples")
    print(f" Test: {len(test_df):,} samples")

    # Guarantee the test split has a Transcription column (empty strings).
    # NOTE(review): presumably so both splits share a schema -- confirm that
    # Train.csv actually carries this column.
    if 'Transcription' not in test_df.columns:
        test_df['Transcription'] = ""

    print("\n2. Creating dataset structure...")
    dataset_dict = DatasetDict({
        "train": Dataset.from_pandas(train_df),
        "test": Dataset.from_pandas(test_df),
    })

    print("\n3. Checking authentication...")
    try:
        user = HfApi().whoami()
        print(f" ✓ Logged in as: {user['name']}")
    except Exception as e:
        # Any failure here (no token, network error, ...) aborts the upload.
        print(f" ❌ Not logged in: {e}")
        return False

    print("\n4. Uploading dataset (CSV files only)...")
    print(" Note: Audio files should be uploaded separately via Space storage")
    try:
        dataset_dict.push_to_hub(DATASET_NAME, private=PRIVATE)
    except Exception as e:
        print(f"\n❌ Error uploading: {e}")
        # Imported lazily: only needed on the failure path.
        import traceback

        traceback.print_exc()
        return False

    print("\n✅ CSV files uploaded successfully!")
    print(f" View at: https://huggingface.co/datasets/{DATASET_NAME}")
    return True
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the upload and exit non-zero on failure so the
    # calling shell / CI job can detect it.
    if not upload_csv_dataset():
        print("\n❌ Upload failed.")
        sys.exit(1)

    print("\n" + "=" * 70)
    print("✅ UPLOAD COMPLETE")
    print("=" * 70)
    print("\nNext steps:")
    print("1. Upload Audio.zip to Space storage (one-time)")
    print("2. Space will extract audio files automatically")
    print("3. CSV data loads from HF Dataset automatically")
|
|
|
|
|
|