#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "inspect-ai @ git+https://github.com/dvsrepo/inspect_ai.git@fallback-to-modified-for-hf-fs",
# "datasets",
# "huggingface_hub",
# "pandas",
# "pyarrow",
# ]
# ///
"""Export-only runner that reads logs from a source dataset and exports to parquet."""
import os
import sys
import tempfile
from pathlib import Path
from inspect_ai.analysis import evals_df, samples_df
from huggingface_hub import HfApi


def export_logs_to_parquet(log_dir: str, dataset_repo: str) -> None:
    """Export eval logs to parquet format and upload to a HuggingFace dataset.

    Args:
        log_dir: HF filesystem path to the logs
            (e.g., "hf://datasets/username/name/logs")
        dataset_repo: Dataset repository ID, with or without the "datasets/"
            prefix (e.g., "datasets/username/name" or "username/name")
    """
# Get HF token from environment
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
raise ValueError("HF_TOKEN environment variable not set")
api = HfApi(token=hf_token)
    # Strip the optional "datasets/" prefix; HfApi expects "username/name"
    repo_id = dataset_repo.removeprefix("datasets/")
# Read evals dataframe
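    # evals_df yields one row per eval log; samples_df (below) yields one
    # row per sample within those logs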
print(" Reading evals dataframe...")
print(f" Log directory: {log_dir}")
try:
evals = evals_df(logs=log_dir)
print(f" βœ“ Read {len(evals)} eval records")
except Exception as e:
print(f" βœ— Error reading evals: {e}")
raise
# Read samples dataframe
print(" Reading samples dataframe...")
try:
samples = samples_df(logs=log_dir)
print(f" βœ“ Read {len(samples)} sample records")
except Exception as e:
print(f" βœ— Error reading samples: {e}")
raise
# Write to temporary parquet files
with tempfile.TemporaryDirectory() as tmpdir:
evals_path = Path(tmpdir) / "evals.parquet"
samples_path = Path(tmpdir) / "samples.parquet"
print(f" Writing evals to parquet ({len(evals)} rows)...")
evals.to_parquet(evals_path, index=False, engine="pyarrow")
print(f" Writing samples to parquet ({len(samples)} rows)...")
samples.to_parquet(samples_path, index=False, engine="pyarrow")
# Upload parquet files to root (HuggingFace will auto-detect as separate data files)
# We use descriptive names so they can be loaded separately
print(" Uploading evals.parquet...")
api.upload_file(
path_or_fileobj=str(evals_path),
path_in_repo="evals.parquet",
repo_id=repo_id,
repo_type="dataset",
token=hf_token,
)
print(" Uploading samples.parquet...")
api.upload_file(
path_or_fileobj=str(samples_path),
path_in_repo="samples.parquet",
repo_id=repo_id,
repo_type="dataset",
token=hf_token,
)
print(
f" βœ“ Parquet files available at: https://huggingface.co/datasets/{repo_id}/tree/main"
)
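

# Illustrative sketch (not part of the runner): the exported files can be read
# back with the `datasets` library; "username/name" is a placeholder repo ID.
#
#   from datasets import load_dataset
#   evals = load_dataset("username/name", data_files="evals.parquet", split="train")
#   samples = load_dataset("username/name", data_files="samples.parquet", split="train")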
if __name__ == "__main__":
if len(sys.argv) < 3:
print("Usage: export_runner.py <source_dataset_repo> <target_dataset_repo>")
print(" source_dataset_repo: Dataset repo to read logs from (e.g., datasets/username/name)")
print(" target_dataset_repo: Dataset repo to export parquet to (e.g., datasets/username/name)")
sys.exit(1)
source_dataset_repo = sys.argv[1]
target_dataset_repo = sys.argv[2]
# Construct log directory path for HF filesystem
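    # "hf://" paths are resolved through huggingface_hub's HfFileSystem (an
    # fsspec implementation), which lets evals_df/samples_df read the logs
    # directly from the Hub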
if not source_dataset_repo.startswith("datasets/"):
source_dataset_repo = f"datasets/{source_dataset_repo}"
log_dir = f"hf://{source_dataset_repo}/logs"
print(f"Exporting logs from: {log_dir}")
print(f"Target dataset: {target_dataset_repo}")
try:
export_logs_to_parquet(log_dir, target_dataset_repo)
print("\nβœ“ Export completed successfully!")
except Exception as e:
import traceback
print(f"\nβœ— Export failed: {e}")
print(f"\nFull traceback:")
print(traceback.format_exc())
sys.exit(1)