prometheus04 committed on
Commit
289b193
·
verified ·
1 Parent(s): 2e31dbb

Upload training_scripts/phase3_eval.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. training_scripts/phase3_eval.py +130 -0
training_scripts/phase3_eval.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Phase 3: Evaluation on HumanEval+ and MBPP+"""
3
+
4
+ import subprocess
5
+ from pathlib import Path
6
+
# Config
CKPT_DIR = Path("./qwen3_pipeline/checkpoint")
RESULTS_DIR = Path("./qwen3_pipeline/results")
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# The evaluator is pointed at the "merged" sub-directory of the checkpoint
# directory; build the path once and reuse it for the existence check below.
merged_dir = CKPT_DIR / "merged"
model_path = str(merged_dir)

print("=" * 70)
print("PHASE 3: EVALUATION")
print("=" * 70)
print(f"\nModel: {model_path}\n")

# Check if model exists: abort before launching any evaluation when the
# merged model is missing.
if not merged_dir.exists():
    print(f"❌ Model not found at {model_path}")
    print(" Did Phase 2 complete successfully?")
    # `exit()` is a helper injected by the `site` module for interactive use
    # and is not guaranteed to exist (e.g. under `python -S`); raising
    # SystemExit terminates with the same status code without relying on it.
    raise SystemExit(1)
24
+
# HumanEval+
print("=" * 70)
print("HUMANEVAL+")
print("=" * 70)

humaneval_log = RESULTS_DIR / "humaneval_plus.log"

# Build the command as an argument list and run it without a shell: the model
# path is passed verbatim as one argument, so spaces, quotes or shell
# metacharacters in it cannot change what gets executed.
cmd_humaneval = [
    "evalplus.evaluate",
    "--model", model_path,
    "--dataset", "humaneval",
    "--backend", "vllm",
    "--greedy",
    "--tp", "1",
]

print(f"\nRunning: {' '.join(cmd_humaneval)}\n")

try:
    result = subprocess.run(
        cmd_humaneval,
        capture_output=True,
        text=True,
        timeout=1800,  # 30 min timeout
    )

    # Save full log (stdout first, then stderr under a separator). UTF-8 is
    # requested explicitly so non-ASCII tool output cannot break the write on
    # platforms whose default encoding is narrower.
    with open(humaneval_log, "w", encoding="utf-8") as f:
        f.write(result.stdout)
        f.write("\n\n=== STDERR ===\n\n")
        f.write(result.stderr)

    if result.returncode != 0:
        # A non-zero exit status means the evaluator itself failed; report it
        # instead of printing an empty results section.
        print(f"❌ HumanEval+ exited with code {result.returncode}")
        for line in result.stderr.splitlines()[-10:]:
            print(line)
    else:
        # Print key results: only the lines mentioning a score-like keyword.
        print("\n" + "=" * 70)
        print("HUMANEVAL+ RESULTS")
        print("=" * 70)

        for line in result.stdout.split("\n"):
            if any(k in line.lower() for k in ["pass@1", "base", "plus", "score"]):
                print(line)

    print(f"\nFull log: {humaneval_log}")

except subprocess.TimeoutExpired:
    print("❌ HumanEval+ timed out (30 min)")
except Exception as e:
    # Script-level boundary: report the failure (e.g. the executable is not
    # installed, or the log cannot be written) and carry on with the rest of
    # the script.
    print(f"❌ HumanEval+ failed: {e}")
73
+
# MBPP+
print("\n" + "=" * 70)
print("MBPP+")
print("=" * 70)

mbpp_log = RESULTS_DIR / "mbpp_plus.log"

# Build the command as an argument list and run it without a shell: the model
# path is passed verbatim as one argument, so spaces, quotes or shell
# metacharacters in it cannot change what gets executed.
cmd_mbpp = [
    "evalplus.evaluate",
    "--model", model_path,
    "--dataset", "mbpp",
    "--backend", "vllm",
    "--greedy",
    "--tp", "1",
]

print(f"\nRunning: {' '.join(cmd_mbpp)}\n")

try:
    result = subprocess.run(
        cmd_mbpp,
        capture_output=True,
        text=True,
        timeout=1800,  # 30 min timeout
    )

    # Save full log (stdout first, then stderr under a separator). UTF-8 is
    # requested explicitly so non-ASCII tool output cannot break the write on
    # platforms whose default encoding is narrower.
    with open(mbpp_log, "w", encoding="utf-8") as f:
        f.write(result.stdout)
        f.write("\n\n=== STDERR ===\n\n")
        f.write(result.stderr)

    if result.returncode != 0:
        # A non-zero exit status means the evaluator itself failed; report it
        # instead of printing an empty results section.
        print(f"❌ MBPP+ exited with code {result.returncode}")
        for line in result.stderr.splitlines()[-10:]:
            print(line)
    else:
        # Print key results: only the lines mentioning a score-like keyword.
        print("\n" + "=" * 70)
        print("MBPP+ RESULTS")
        print("=" * 70)

        for line in result.stdout.split("\n"):
            if any(k in line.lower() for k in ["pass@1", "base", "plus", "score"]):
                print(line)

    print(f"\nFull log: {mbpp_log}")

except subprocess.TimeoutExpired:
    print("❌ MBPP+ timed out (30 min)")
except Exception as e:
    # Script-level boundary: report the failure (e.g. the executable is not
    # installed, or the log cannot be written) and carry on with the rest of
    # the script.
    print(f"❌ MBPP+ failed: {e}")
122
+
# Summary: closing banner, the results directory, and the path of each
# benchmark log.
banner = "=" * 70
print("\n" + banner)
print("✓ PHASE 3 COMPLETE")
print(banner)

print(f"\nResults saved to: {RESULTS_DIR}/")
for log_path in (humaneval_log, mbpp_log):
    print(f" - {log_path}")

# Pointer to the follow-up script in the pipeline.
print("\n➡️ Next: python phase4_codet.py")