dvilasuero (HF Staff) committed
Commit 6e6b484 · verified · 1 Parent(s): e08e1ac

Upload export_runner.py with huggingface_hub

Files changed (1)
  1. export_runner.py +121 -0
export_runner.py ADDED
@@ -0,0 +1,121 @@
+ #!/usr/bin/env python3
+ # /// script
+ # requires-python = ">=3.10"
+ # dependencies = [
+ #     "inspect-ai @ git+https://github.com/dvsrepo/inspect_ai.git@fallback-to-modified-for-hf-fs",
+ #     "datasets",
+ #     "huggingface_hub",
+ #     "pandas",
+ #     "pyarrow",
+ # ]
+ # ///
+ """Export-only runner that reads eval logs from a source dataset and exports them to parquet."""
+
+ import os
+ import sys
+ import tempfile
+ from pathlib import Path
+
+ from huggingface_hub import HfApi
+ from inspect_ai.analysis import evals_df, samples_df
+
+
+ def export_logs_to_parquet(log_dir: str, dataset_repo: str) -> None:
+     """Export eval logs to parquet format and upload them to a Hugging Face dataset.
+
+     Args:
+         log_dir: HF filesystem path to the logs (e.g., "hf://datasets/username/name/logs")
+         dataset_repo: Dataset repository ID (e.g., "datasets/username/name")
+     """
+     # Read the HF token from the environment
+     hf_token = os.getenv("HF_TOKEN")
+     if not hf_token:
+         raise ValueError("HF_TOKEN environment variable not set")
+
+     api = HfApi(token=hf_token)
+
+     # Strip the leading 'datasets/' prefix for API calls
+     repo_id = dataset_repo.removeprefix("datasets/")
+
+     # Read the evals dataframe
+     print(" Reading evals dataframe...")
+     print(f" Log directory: {log_dir}")
+     try:
+         evals = evals_df(logs=log_dir)
+         print(f" ✓ Read {len(evals)} eval records")
+     except Exception as e:
+         print(f" ✗ Error reading evals: {e}")
+         raise
+
+     # Read the samples dataframe
+     print(" Reading samples dataframe...")
+     try:
+         samples = samples_df(logs=log_dir)
+         print(f" ✓ Read {len(samples)} sample records")
+     except Exception as e:
+         print(f" ✗ Error reading samples: {e}")
+         raise
+
+     # Write to temporary parquet files
+     with tempfile.TemporaryDirectory() as tmpdir:
+         evals_path = Path(tmpdir) / "evals.parquet"
+         samples_path = Path(tmpdir) / "samples.parquet"
+
+         print(f" Writing evals to parquet ({len(evals)} rows)...")
+         evals.to_parquet(evals_path, index=False, engine="pyarrow")
+
+         print(f" Writing samples to parquet ({len(samples)} rows)...")
+         samples.to_parquet(samples_path, index=False, engine="pyarrow")
+
+         # Upload the parquet files to the repo root (the Hub auto-detects them as
+         # separate data files); the descriptive names let them be loaded separately
+         print(" Uploading evals.parquet...")
+         api.upload_file(
+             path_or_fileobj=str(evals_path),
+             path_in_repo="evals.parquet",
+             repo_id=repo_id,
+             repo_type="dataset",
+             token=hf_token,
+         )
+
+         print(" Uploading samples.parquet...")
+         api.upload_file(
+             path_or_fileobj=str(samples_path),
+             path_in_repo="samples.parquet",
+             repo_id=repo_id,
+             repo_type="dataset",
+             token=hf_token,
+         )
+
+     print(
+         f" ✓ Parquet files available at: https://huggingface.co/datasets/{repo_id}/tree/main"
+     )
+
+
+ if __name__ == "__main__":
+     if len(sys.argv) < 3:
+         print("Usage: export_runner.py <source_dataset_repo> <target_dataset_repo>")
+         print(" source_dataset_repo: Dataset repo to read logs from (e.g., datasets/username/name)")
+         print(" target_dataset_repo: Dataset repo to export parquet to (e.g., datasets/username/name)")
+         sys.exit(1)
+
+     source_dataset_repo = sys.argv[1]
+     target_dataset_repo = sys.argv[2]
+
+     # Construct the log directory path for the HF filesystem
+     if not source_dataset_repo.startswith("datasets/"):
+         source_dataset_repo = f"datasets/{source_dataset_repo}"
+     log_dir = f"hf://{source_dataset_repo}/logs"
+
+     print(f"Exporting logs from: {log_dir}")
+     print(f"Target dataset: {target_dataset_repo}")
+
+     try:
+         export_logs_to_parquet(log_dir, target_dataset_repo)
+         print("\n✓ Export completed successfully!")
+     except Exception as e:
+         import traceback
+         print(f"\n✗ Export failed: {e}")
+         print("\nFull traceback:")
+         print(traceback.format_exc())
+         sys.exit(1)
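
Note: because the script carries PEP 723 inline metadata (the "# /// script" block), it can be run directly with a compatible runner, e.g. "uv run export_runner.py <source_dataset_repo> <target_dataset_repo>" with HF_TOKEN set in the environment. Below is a minimal sketch of reading the exported files back. It assumes a placeholder repo id ("username/name" is illustrative, not a repo from this commit), that huggingface_hub is installed so pandas can resolve hf:// paths through its fsspec filesystem, and that the evals and samples dataframes share an eval_id column for joining.

import pandas as pd

# hf:// paths resolve via huggingface_hub's HfFileSystem (fsspec);
# "username/name" below is a placeholder, not a repo from this commit.
evals = pd.read_parquet("hf://datasets/username/name/evals.parquet")
samples = pd.read_parquet("hf://datasets/username/name/samples.parquet")

# Assuming samples reference their parent eval through a shared eval_id
# column, the two tables can be joined for per-sample analysis.
merged = samples.merge(evals, on="eval_id", suffixes=("_sample", "_eval"))
print(merged.shape)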