#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "inspect-ai @ git+https://github.com/dvsrepo/inspect_ai.git@fallback-to-modified-for-hf-fs",
# "datasets",
# "huggingface_hub",
# "pandas",
# "pyarrow",
# ]
# ///
"""Export-only runner that reads logs from a source dataset and exports to parquet."""
import os
import sys
import tempfile
from pathlib import Path

from huggingface_hub import HfApi
from inspect_ai.analysis import evals_df, samples_df


def export_logs_to_parquet(log_dir: str, dataset_repo: str) -> None:
"""Export eval logs to parquet format and upload to HuggingFace dataset.
Args:
log_dir: HF filesystem path to logs (e.g., "hf://datasets/username/name/logs")
dataset_repo: Dataset repository ID (e.g., "datasets/username/name")
"""
# Get HF token from environment
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
raise ValueError("HF_TOKEN environment variable not set")
api = HfApi(token=hf_token)
# Remove 'datasets/' prefix for API calls
repo_id = (
dataset_repo.replace("datasets/", "")
if dataset_repo.startswith("datasets/")
else dataset_repo
)
# Read evals dataframe
print(" Reading evals dataframe...")
print(f" Log directory: {log_dir}")
try:
evals = evals_df(logs=log_dir)
print(f" β Read {len(evals)} eval records")
except Exception as e:
print(f" β Error reading evals: {e}")
raise
# Read samples dataframe
print(" Reading samples dataframe...")
try:
samples = samples_df(logs=log_dir)
print(f" β Read {len(samples)} sample records")
except Exception as e:
print(f" β Error reading samples: {e}")
raise
# Write to temporary parquet files
with tempfile.TemporaryDirectory() as tmpdir:
evals_path = Path(tmpdir) / "evals.parquet"
samples_path = Path(tmpdir) / "samples.parquet"
print(f" Writing evals to parquet ({len(evals)} rows)...")
evals.to_parquet(evals_path, index=False, engine="pyarrow")
print(f" Writing samples to parquet ({len(samples)} rows)...")
samples.to_parquet(samples_path, index=False, engine="pyarrow")
# Upload parquet files to root (HuggingFace will auto-detect as separate data files)
# We use descriptive names so they can be loaded separately
print(" Uploading evals.parquet...")
api.upload_file(
path_or_fileobj=str(evals_path),
path_in_repo="evals.parquet",
repo_id=repo_id,
repo_type="dataset",
token=hf_token,
)
print(" Uploading samples.parquet...")
api.upload_file(
path_or_fileobj=str(samples_path),
path_in_repo="samples.parquet",
repo_id=repo_id,
repo_type="dataset",
token=hf_token,
)
print(
f" β Parquet files available at: https://huggingface.co/datasets/{repo_id}/tree/main"
)
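

# Illustrative sketch (not part of the runner): once uploaded, the two parquet
# files can be read back independently over the HF filesystem, e.g. with
# pandas, since huggingface_hub registers the hf:// fsspec protocol. The repo
# id below is a placeholder.
#
#   import pandas as pd
#
#   evals = pd.read_parquet("hf://datasets/username/name/evals.parquet")
#   samples = pd.read_parquet("hf://datasets/username/name/samples.parquet")
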
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: export_runner.py <source_dataset_repo> <target_dataset_repo>")
        print(" source_dataset_repo: Dataset repo to read logs from (e.g., datasets/username/name)")
        print(" target_dataset_repo: Dataset repo to export parquet to (e.g., datasets/username/name)")
        sys.exit(1)

    source_dataset_repo = sys.argv[1]
    target_dataset_repo = sys.argv[2]

    # Construct the log directory path for the HF filesystem
    if not source_dataset_repo.startswith("datasets/"):
        source_dataset_repo = f"datasets/{source_dataset_repo}"
    log_dir = f"hf://{source_dataset_repo}/logs"

    print(f"Exporting logs from: {log_dir}")
    print(f"Target dataset: {target_dataset_repo}")

    try:
        export_logs_to_parquet(log_dir, target_dataset_repo)
        print("\n✓ Export completed successfully!")
    except Exception as e:
        import traceback

        print(f"\n✗ Export failed: {e}")
        print("\nFull traceback:")
        print(traceback.format_exc())
        sys.exit(1)