Source from repo
Microsoft Foundry Skill

Deploy, evaluate, and manage AI agents end-to-end on Microsoft Azure AI Foundry
microsoft · GitHub · microsoft · Official · Source repo · Original GitHub link · Publisher page
Files
154
Skill
n/a
Size
976.2 KB
Entrypoint
SKILL.md
Format
git-repo
Open file
finetuning/scripts/calibrate_grader.py

Syntax-highlighted preview of this file as included in the skill package.
Rendered Source
code · 249 lines · Free
finetuning/scripts/calibrate_grader.py
1# /// script
2# dependencies = [
3#   "openai>=1.0",
4#   "azure-identity",
5#   "azure-ai-projects",
6# ]
7# ///
8"""
9calibrate_grader.py — Calibrate RFT grader pass_threshold before submitting a job.
10 
11Runs the base model on your training/validation data, scores each output
12with your Python grader, and recommends the optimal pass_threshold.
13 
14Usage:
15  python calibrate_grader.py --base-url <url> --api-key KEY \
16      --model o4-mini --data train.jsonl --grader grader.py --n 30
17 
18  python calibrate_grader.py --model gpt-4.1-mini --data val.jsonl \
19      --grader grader.py --n 20 --tools '[{"name": "search", "server_url": "https://..."}]'
20"""
21 
22import argparse
23import json
24import os
25import random
26import sys
27 
# Switch both standard streams to UTF-8. This script prints non-ASCII status
# markers (✅ / ⚠️ / ❌ / █), presumably the reason for forcing the encoding —
# TODO confirm. The two calls share one try block, so if reconfiguring stdout
# fails, stderr is left untouched as well.
try:
    sys.stdout.reconfigure(encoding="utf-8")
    sys.stderr.reconfigure(encoding="utf-8")
except (AttributeError, OSError):
    pass  # Stream not reconfigurable (older Python or non-tty); default encoding is fine
33import time
34 
35sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
36from common import HelpOnErrorParser, get_clients
37 
38 
def load_grader(grader_path):
    """Execute a Python grader file and hand back its ``grade`` callable.

    The path is made absolute, the file is read as UTF-8, compiled with that
    absolute path as its filename, and executed in a fresh, empty namespace.
    The object bound to ``grade`` in that namespace is returned.

    The process exits with status 1 (after printing a message) when the path
    is not an existing regular file, or when the executed code does not
    define a top-level ``grade`` name.

    SECURITY: the grader file is run as ordinary Python code with the same
    permissions as this script. Only load grader files that you wrote or
    reviewed — never untrusted files from the internet or unknown sources.
    """
    resolved = os.path.abspath(grader_path)
    if not os.path.isfile(resolved):
        print(f"❌ Grader file not found: {resolved}")
        sys.exit(1)

    with open(resolved, encoding="utf-8") as handle:
        text = handle.read()

    # Compile separately so the absolute path is attached to the code object
    # and shows up in any traceback raised by the grader.
    code_obj = compile(text, resolved, "exec")
    grader_globals = {}
    exec(code_obj, grader_globals)

    try:
        return grader_globals["grade"]
    except KeyError:
        print("❌ Grader file must define a grade(sample, item) function")
        sys.exit(1)
59 
60 
def run_model(client, model, messages, tools_schema=None, max_retries=3):
    """Send one chat-completion request and return ``(output_text, output_tools)``.

    The request always carries ``model``, ``messages`` and
    ``max_completion_tokens=4096``; ``tools`` is added only when
    ``tools_schema`` is truthy.

    Returns:
        A 2-tuple. ``output_text`` is the first choice's message content
        (``""`` when it is empty/None). ``output_tools`` is a list with one
        ``{"type": "function", "function": {"name", "arguments"}}`` dict per
        tool call on that message (empty when there are none).

        Failures never raise: any exception is reported as
        ``("ERROR: <exception text>", [])``. An exception whose text contains
        ``"429"`` is retried after sleeping ``5 * attempt_number`` seconds,
        as long as attempts remain. With ``max_retries <= 0`` no request is
        made and ``("ERROR: max retries", [])`` is returned.
    """
    request = {"model": model, "messages": messages, "max_completion_tokens": 4096}
    if tools_schema:
        request["tools"] = tools_schema

    final_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(**request)
            message = response.choices[0].message
            text = message.content or ""
            calls = []
            for call in message.tool_calls or []:
                calls.append(
                    {
                        "type": "function",
                        "function": {
                            "name": call.function.name,
                            "arguments": call.function.arguments,
                        },
                    }
                )
            return text, calls
        except Exception as exc:
            rate_limited = "429" in str(exc)
            if not rate_limited or attempt >= final_attempt:
                return f"ERROR: {exc}", []
            # Linear back-off: 5s, 10s, ... before the next attempt.
            time.sleep(5 * (attempt + 1))
    return "ERROR: max retries", []
85 
86 
def _print_calibration_report(scored, skipped=0):
    """Print average, threshold table, recommendation and histogram for ``scored``.

    ``scored`` is a non-empty list of float scores; ``skipped`` is the number
    of examples that produced no score and is reported in the header only.
    """
    total = len(scored)
    avg = sum(scored) / total
    print(f"\n{'='*60}")
    print(f"  BASE MODEL GRADER CALIBRATION ({total} examples)")
    print(f"  Average score: {avg:.1%}")
    if skipped:
        print(f"  Skipped (no model output): {skipped}")
    print(f"{'='*60}")

    print(f"\n  {'Threshold':>10} {'Pass Rate':>10} {'Fail Rate':>10} {'Signal':>20}")
    print(f"  {'-'*10} {'-'*10} {'-'*10} {'-'*20}")

    best_threshold = None
    best_fail_rate = None
    best_distance = float("inf")

    for threshold in (0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 1.0):
        # Derive both rates from integer counts; computing fail_rate as
        # 1 - pass_rate introduces float error that misclassifies exact
        # boundaries (e.g. 1 - 0.9 < 0.10).
        fail_count = sum(1 for s in scored if s < threshold)
        fail_rate = fail_count / total
        pass_rate = (total - fail_count) / total

        if 0.25 <= fail_rate <= 0.50:
            signal = "✅ Good (25-50%)"
            distance = abs(fail_rate - 0.35)  # Ideal is ~35%
            # Strict '<' keeps the lowest threshold on ties.
            if distance < best_distance:
                best_distance = distance
                best_threshold = threshold
                best_fail_rate = fail_rate
        elif fail_rate < 0.10:
            signal = "❌ Too easy"
        elif fail_rate < 0.25:
            signal = "⚠️ Weak signal"
        elif fail_rate <= 0.70:
            signal = "⚠️ Harsh"
        else:
            signal = "❌ Too hard"

        print(f"  {threshold:>10.2f} {pass_rate:>9.0%} {fail_rate:>9.0%} {signal:>20}")

    if best_threshold is not None:
        print(f"\n  ✅ Recommended pass_threshold: {best_threshold}")
        print(f"     (~{best_fail_rate:.0%} failure rate)")
    else:
        print("\n  ⚠️ No threshold in the ideal 25-50% failure range.")
        print("     Consider adjusting your grader scoring dimensions.")

    # Score distribution: each label counts scores below its upper edge that
    # did not fall into an earlier bucket; everything else lands in the last.
    print("\n  Score distribution:")
    edges = (("0.0-0.2", 0.2), ("0.2-0.4", 0.4), ("0.4-0.6", 0.6), ("0.6-0.8", 0.8), ("0.8-0.9", 0.9))
    buckets = {label: 0 for label, _ in edges}
    buckets["0.9-1.0"] = 0
    for s in scored:
        for label, upper in edges:
            if s < upper:
                buckets[label] += 1
                break
        else:
            buckets["0.9-1.0"] += 1
    for bucket, count in buckets.items():
        bar = "█" * count
        print(f"    {bucket}: {count:3d} {bar}")


def calibrate(client, model, data, grade_fn, tools_schema=None, n=30):
    """Run the base model on data, score with the grader, print threshold analysis.

    Args:
        client: Object passed straight through to ``run_model``.
        model: Model/deployment name passed to ``run_model``.
        data: List of examples. Each is expected to be a dict with a
            ``"messages"`` list; every other key is forwarded to the grader
            as ``item``.
        grade_fn: Callable ``grade_fn(sample, item)`` returning a number.
            ``sample`` is ``{"output_text": ..., "output_tools": ...}``.
        tools_schema: Optional tool schemas forwarded to ``run_model``.
        n: Maximum number of examples; larger datasets are randomly sampled
            down to ``n`` (values below 0 are treated as 0).

    Returns:
        None. All results are printed.

    Scoring rules:
        * Examples without usable messages, and examples whose model call
          failed (``run_model`` returned ``"ERROR: ..."``), produce no score
          and are excluded from every statistic — a failed API call says
          nothing about the grader.
        * A grader that raises, or returns something ``float()`` cannot
          convert (or NaN), is counted as a score of 0.0.
    """
    if not data:
        print("No examples to evaluate. Check your data file.")
        return

    # Sample if dataset is larger than n
    if len(data) > n:
        data = random.sample(data, max(n, 0))

    print(f"Running {model} on {len(data)} examples...\n")

    scores = []  # one entry per example; None marks "no score produced"
    requests_made = 0
    for i, ex in enumerate(data):
        messages = ex.get("messages") if isinstance(ex, dict) else None
        if not messages:
            print(f"  [{i+1:3d}] ❌ Example has no messages; skipped")
            scores.append(None)
            continue

        last = messages[-1]
        content = last.get("content") if isinstance(last, dict) else last
        # Content may be None or a structured (non-string) payload; only a
        # short preview is needed for the log line.
        user_msg = content if isinstance(content, str) else str(content or "")

        # Rate limiting: pause between consecutive requests, including after
        # a failed one, but not before the first or after the last.
        if requests_made:
            time.sleep(0.5)
        requests_made += 1

        output_text, output_tools = run_model(client, model, messages, tools_schema)

        if output_text.startswith("ERROR:"):
            print(f"  [{i+1:3d}] ❌ {output_text[:60]}")
            scores.append(None)
            continue

        # Build sample dict matching what the grader expects
        sample = {"output_text": output_text, "output_tools": output_tools}

        # Build item dict from all fields in the training example
        item = {k: v for k, v in ex.items() if k != "messages"}

        try:
            score = float(grade_fn(sample, item))
            if score != score:  # NaN compares unequal to itself
                raise ValueError("grader returned NaN")
        except Exception as e:
            print(f"  [{i+1:3d}] ❌ Grader error: {e}")
            scores.append(0.0)
            continue

        status = "✅" if score >= 0.9 else ("⚠️" if score >= 0.5 else "❌")
        print(f"  [{i+1:3d}] {score:.3f} {status}  {user_msg[:55]}")
        scores.append(score)

    # Analysis
    scored = [s for s in scores if s is not None]
    if not scored:
        print("\n❌ No examples were scored successfully. Check model access and data format.")
        return

    _print_calibration_report(scored, skipped=len(scores) - len(scored))
188 
189 
def build_parser():
    """Construct the command-line parser for the calibration script.

    Connection options (``--base-url``, ``--endpoint``, ``--api-key``,
    ``--project-endpoint``) default to the ``OPENAI_BASE_URL``,
    ``AZURE_OPENAI_ENDPOINT``, ``AZURE_OPENAI_API_KEY`` and
    ``AZURE_AI_PROJECT_ENDPOINT`` environment variables, read when this
    function is called. ``--model``, ``--data`` and ``--grader`` are
    required; ``--n`` defaults to 30 and ``--seed`` to 42.

    Returns:
        The configured ``HelpOnErrorParser`` instance.
    """
    env = os.environ.get
    usage_examples = "\n".join(
        [
            "Example:",
            "  python calibrate_grader.py --model o4-mini --data train.jsonl --grader grader.py",
            "  python calibrate_grader.py --model o4-mini --data val.jsonl --grader grader.py --n 20",
        ]
    )
    parser = HelpOnErrorParser(
        description="Calibrate RFT grader pass_threshold on base model outputs",
        epilog=usage_examples,
        formatter_class=argparse.RawTextHelpFormatter,
    )

    # (flag, add_argument keyword arguments) in the order they appear in --help.
    options = (
        ("--base-url", {"default": env("OPENAI_BASE_URL"), "help": "Project /v1/ endpoint URL"}),
        ("--endpoint", {"default": env("AZURE_OPENAI_ENDPOINT"), "help": "Azure OpenAI endpoint (fallback)"}),
        ("--api-key", {"default": env("AZURE_OPENAI_API_KEY"), "help": "API key"}),
        ("--project-endpoint", {"default": env("AZURE_AI_PROJECT_ENDPOINT"), "help": "Azure AI project endpoint"}),
        ("--model", {"required": True, "help": "Base model deployment name to calibrate against"}),
        ("--data", {"required": True, "help": "Path to training or validation JSONL file"}),
        ("--grader", {"required": True, "help": "Path to Python grader file (must define grade(sample, item))"}),
        ("--n", {"type": int, "default": 30, "help": "Number of examples to evaluate (default: 30)"}),
        (
            "--tools",
            {
                "default": None,
                "help": "Tool schemas as JSON array (for tool-calling models). Pass as a JSON string.",
            },
        ),
        ("--seed", {"type": int, "default": 42, "help": "Random seed for sampling (default: 42)"}),
    )
    for flag, spec in options:
        parser.add_argument(flag, **spec)
    return parser
214 
215 
if __name__ == "__main__":
    # Script entry point: parse arguments, build the API client, load the
    # dataset and grader, then run the calibration. Input problems are
    # reported with a short message and exit status 1 instead of a traceback,
    # matching how load_grader reports a missing grader file.
    parser = build_parser()
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    args = parser.parse_args()
    if args.n < 1:
        # A negative value would otherwise reach random.sample and raise.
        print(f"❌ --n must be a positive integer (got {args.n})")
        sys.exit(1)
    random.seed(args.seed)  # makes the sampling inside calibrate() reproducible

    client, method = get_clients(base_url=args.base_url, azure_endpoint=args.endpoint, project_endpoint=args.project_endpoint, api_key=args.api_key)

    # Load data
    if not os.path.isfile(args.data):
        print(f"❌ Data file not found: {args.data}")
        sys.exit(1)
    data = []
    with open(args.data, encoding="utf-8") as f:
        for ln, line in enumerate(f, 1):
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"⚠️ Skipping malformed JSON on line {ln}: {e}")
                continue
            # calibrate() reads ex["messages"] and ex.items(), so every row
            # must be a JSON object.
            if not isinstance(record, dict):
                print(f"⚠️ Skipping line {ln}: expected a JSON object, got {type(record).__name__}")
                continue
            data.append(record)
    print(f"Loaded {len(data)} examples from {args.data}")

    # Load grader
    grade_fn = load_grader(args.grader)
    print(f"Loaded grader from {args.grader}")

    # Parse tools if provided
    tools_schema = None
    if args.tools:
        try:
            tools_schema = json.loads(args.tools)
        except json.JSONDecodeError as e:
            print(f"❌ --tools is not valid JSON: {e}")
            sys.exit(1)
        if not isinstance(tools_schema, list):
            print("❌ --tools must be a JSON array of tool schemas")
            sys.exit(1)

    calibrate(client, args.model, data, grade_fn, tools_schema, args.n)
249
Preparing the source view

Microsoft Foundry Skill

finetuning/scripts/calibrate_grader.py