"""
Script to run the agent on Unit 4 questions and optionally submit results
to the course scoring API.

Usage:
    # Dry run (no submit) on first 5 questions:
    python evaluate_and_submit.py --limit 5

    # Submit results (requires username and Space URL):
    python evaluate_and_submit.py --submit --username YOUR_HF_USERNAME \\
        --agent-code-url https://huggingface.co/spaces/alfulanny/huggingface_ai_final/tree/main

Notes:
- The scoring API expects exact-match answers, so make sure the agent
  returns only the final answer without extra formatting.
- Ensure you've run `huggingface-cli login` before submission.
"""

import argparse
import os
import sys
import time
from typing import Any, Dict, List

from evaluation_client import ScoringAPIClient
from code_agent import run_agent


def extract_prompt_from_question(q: Dict[str, Any]) -> str:
    """Extract the actual question/prompt from a question dict.

    Tries a list of known keys in priority order; falls back to the
    string representation of the whole dict if none match.
    """
    for key in ("question", "prompt", "input", "text", "task"):
        if key in q and isinstance(q[key], str):
            return q[key]
    return str(q)


def main(argv: List[str]) -> None:
    """Fetch questions, run the agent on each, and optionally submit answers.

    Args:
        argv: Command-line arguments (excluding the program name).

    Exits with status 1 on fatal errors (no questions, missing submit
    credentials, or a failed submission).
    """
    parser = argparse.ArgumentParser(
        description="Evaluate agent on course questions and optionally submit."
    )
    parser.add_argument("--limit", type=int, default=0,
                        help="Max questions to process (0=all)")
    parser.add_argument("--submit", action="store_true",
                        help="Submit answers to scoring API")
    parser.add_argument("--username", type=str,
                        default=os.environ.get("HF_USERNAME"),
                        help="HF username for submission")
    parser.add_argument("--agent-code-url", type=str,
                        default=os.environ.get("AGENT_CODE_URL"),
                        help="Public Space URL for your agent")
    args = parser.parse_args(argv)

    client = ScoringAPIClient()

    print("Fetching questions from scoring API...")
    questions = client.get_questions()
    if not questions:
        print("ERROR: No questions returned by the API.")
        sys.exit(1)

    if args.limit > 0:
        questions = questions[:args.limit]

    print(f"Processing {len(questions)} questions...")

    answers = []
    for idx, q in enumerate(questions, 1):
        # Different API versions have used different key names for the task id.
        task_id = q.get("task_id") or q.get("id") or q.get("taskId")
        prompt = extract_prompt_from_question(q)
        print(f"\n[{idx}/{len(questions)}] Task {task_id}")
        print(f"  Prompt: {prompt[:100]}...")
        try:
            # Coerce to str defensively in case the agent returns a
            # non-string final answer (e.g. a number).
            ans = str(run_agent(prompt)).strip()
            answers.append({"task_id": task_id, "submitted_answer": ans})
            print(f"  Answer: {ans[:80]}...")
        except Exception as e:
            print(f"  ERROR: {type(e).__name__}: {str(e)[:100]}")
            # Still add an error answer to maintain alignment
            answers.append({"task_id": task_id,
                            "submitted_answer": f"(error) {type(e).__name__}"})
        # Polite pacing to avoid rate limits
        time.sleep(0.5)

    print(f"\nāœ“ Prepared answers for {len(answers)} tasks")

    if args.submit:
        if not args.username:
            print("ERROR: --submit requires --username (or set HF_USERNAME env var)")
            sys.exit(1)
        if not args.agent_code_url:
            print("ERROR: --submit requires --agent-code-url (or set AGENT_CODE_URL env var)")
            sys.exit(1)

        print(f"\nSubmitting {len(answers)} answers as user '{args.username}'...")
        print(f"Agent Code URL: {args.agent_code_url}")
        try:
            resp = client.submit(username=args.username,
                                 agent_code=args.agent_code_url,
                                 answers=answers)
            print("āœ“ Submission successful!")
            print(f"Response: {resp}")
        except Exception as e:
            print(f"ERROR: Submission failed: {e}")
            sys.exit(1)
    else:
        print("\nDry run complete. To submit, re-run with:")
        print("  python evaluate_and_submit.py --submit --username YOUR_USERNAME \\")
        print("    --agent-code-url https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE")


if __name__ == "__main__":
    main(sys.argv[1:])