prometheus04
/

qwen3-4b-code-finetuned

+#!/usr/bin/env python3
+"""Phase 3: Evaluation on HumanEval+ and MBPP+"""
+import subprocess
+from pathlib import Path
+# Config
+CKPT_DIR = Path("./qwen3_pipeline/checkpoint")
+RESULTS_DIR = Path("./qwen3_pipeline/results")
+RESULTS_DIR.mkdir(parents=True, exist_ok=True)
+model_path = str(CKPT_DIR / "merged")
+print("="*70)
+print("PHASE 3: EVALUATION")
+print("="*70)
+print(f"\nModel: {model_path}\n")
+# Check if model exists
+if not (CKPT_DIR / "merged").exists():
+    print(f"❌ Model not found at {model_path}")
+    print(f"   Did Phase 2 complete successfully?")
+    exit(1)
+# HumanEval+
+print("="*70)
+print("HUMANEVAL+")
+print("="*70)
+humaneval_log = RESULTS_DIR / "humaneval_plus.log"
+cmd_humaneval = (
+    f'evalplus.evaluate '
+    f'--model "{model_path}" '
+    f'--dataset humaneval '
+    f'--backend vllm '
+    f'--greedy '
+    f'--tp 1'
+)
+print(f"\nRunning: {cmd_humaneval}\n")
+try:
+    result = subprocess.run(
+        cmd_humaneval,
+        shell=True,
+        capture_output=True,
+        text=True,
+        timeout=1800  # 30 min timeout
+    )
+    # Save full log
+    with open(humaneval_log, "w") as f:
+        f.write(result.stdout)
+        f.write("\n\n=== STDERR ===\n\n")
+        f.write(result.stderr)
+    # Print key results
+    print("\n" + "="*70)
+    print("HUMANEVAL+ RESULTS")
+    print("="*70)
+    for line in result.stdout.split("\n"):
+        if any(k in line.lower() for k in ["pass@1", "base", "plus", "score"]):
+            print(line)
+    print(f"\nFull log: {humaneval_log}")
+except subprocess.TimeoutExpired:
+    print("❌ HumanEval+ timed out (30 min)")
+except Exception as e:
+    print(f"❌ HumanEval+ failed: {e}")
+# MBPP+
+print("\n" + "="*70)
+print("MBPP+")
+print("="*70)
+mbpp_log = RESULTS_DIR / "mbpp_plus.log"
+cmd_mbpp = (
+    f'evalplus.evaluate '
+    f'--model "{model_path}" '
+    f'--dataset mbpp '
+    f'--backend vllm '
+    f'--greedy '
+    f'--tp 1'
+)
+print(f"\nRunning: {cmd_mbpp}\n")
+try:
+    result = subprocess.run(
+        cmd_mbpp,
+        shell=True,
+        capture_output=True,
+        text=True,
+        timeout=1800  # 30 min timeout
+    )
+    # Save full log
+    with open(mbpp_log, "w") as f:
+        f.write(result.stdout)
+        f.write("\n\n=== STDERR ===\n\n")
+        f.write(result.stderr)
+    # Print key results
+    print("\n" + "="*70)
+    print("MBPP+ RESULTS")
+    print("="*70)
+    for line in result.stdout.split("\n"):
+        if any(k in line.lower() for k in ["pass@1", "base", "plus", "score"]):
+            print(line)
+    print(f"\nFull log: {mbpp_log}")
+except subprocess.TimeoutExpired:
+    print("❌ MBPP+ timed out (30 min)")
+except Exception as e:
+    print(f"❌ MBPP+ failed: {e}")
+# Summary
+print("\n" + "="*70)
+print("✓ PHASE 3 COMPLETE")
+print("="*70)
+print(f"\nResults saved to: {RESULTS_DIR}/")
+print(f"  - {humaneval_log}")
+print(f"  - {mbpp_log}")
+print(f"\n➡️  Next: python phase4_codet.py")