Source from repo
Microsoft Foundry Skill

Build and deploy AI applications on Azure AI Foundry using Microsoft's model catalog and AI services
microsoft · GitHub · microsoft · Official · Source repo · Original GitHub link · Publisher page
Files
155
Skill
n/a
Size
976.3 KB
Entrypoint
SKILL.md
Format
git-repo
Open file
finetuning/scripts/calibrate_grader.py

Syntax-highlighted preview of this file as included in the skill package.
Rendered Source
code · 249 lines · Free
finetuning/scripts/calibrate_grader.py
1# /// script
2# dependencies = [
3#   "openai>=1.0",
4#   "azure-identity",
5#   "azure-ai-projects",
6# ]
7# ///
8"""
9calibrate_grader.py — Calibrate RFT grader pass_threshold before submitting a job.
10 
11Runs the base model on your training/validation data, scores each output
12with your Python grader, and recommends the optimal pass_threshold.
13 
14Usage:
15  python calibrate_grader.py --base-url <url> --api-key KEY \
16      --model o4-mini --data train.jsonl --grader grader.py --n 30
17 
18  python calibrate_grader.py --model gpt-4.1-mini --data val.jsonl \
19      --grader grader.py --n 20 --tools '[{"name": "search", "server_url": "https://..."}]'
20"""
21 
22import argparse
23import json
24import os
25import random
26import sys
27 
# Best-effort: switch stdout, then stderr, to UTF-8 so the emoji and block
# characters this script prints do not fail on consoles with a legacy encoding.
# A failure on stdout also skips stderr.
try:
    for _stream in (sys.stdout, sys.stderr):
        _stream.reconfigure(encoding="utf-8")
except (AttributeError, OSError):
    # Stream not reconfigurable (older Python or non-tty); default encoding is fine
    pass
del _stream  # do not leave the loop variable behind as a module global
33import time
34 
35sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
36from common import HelpOnErrorParser, get_clients
37 
38 
def load_grader(grader_path):
    """Load a Python grader file and return its ``grade`` function.

    The file is read as UTF-8, compiled with its absolute path as the
    filename (so tracebacks point at the grader file), and executed in a
    fresh, empty namespace.

    SECURITY: This executes the grader file as Python code. Only load grader
    files that you wrote or reviewed — never load untrusted files from the
    internet or unknown sources. The grader runs with the same permissions as
    this script.

    Args:
        grader_path: Path to the grader source file. Relative paths are
            resolved against the current working directory.

    Returns:
        The callable bound to the name ``grade`` in the grader file.

    Raises:
        SystemExit: With exit code 1 when the file does not exist or does not
            define a callable named ``grade``. Exceptions raised while
            compiling or executing the grader source propagate unchanged.
    """
    grader_path = os.path.abspath(grader_path)
    if not os.path.isfile(grader_path):
        print(f"❌ Grader file not found: {grader_path}")
        sys.exit(1)
    with open(grader_path, encoding="utf-8") as f:
        source = f.read()
    namespace = {}
    exec(compile(source, grader_path, "exec"), namespace)
    grade = namespace.get("grade")
    # A non-callable `grade` (e.g. `grade = 0.5`) would only fail later, at
    # the first scoring call; reject it here with the same message instead.
    if not callable(grade):
        print("❌ Grader file must define a grade(sample, item) function")
        sys.exit(1)
    return grade
59 
60 
def run_model(client, model, messages, tools_schema=None, max_retries=3):
    """Run one chat completion and return ``(output_text, output_tools)``.

    Args:
        client: Object exposing ``chat.completions.create(**kwargs)``.
        model: Model / deployment name passed as ``model``.
        messages: Chat messages passed as ``messages``.
        tools_schema: Optional tool definitions; sent as ``tools`` only when
            truthy.
        max_retries: Maximum number of attempts. Only rate-limit failures
            (HTTP 429) are retried, after sleeping ``5 * attempt_number``
            seconds.

    Returns:
        A ``(output_text, output_tools)`` tuple. ``output_text`` is the
        message content (``""`` when the content is empty or ``None``).
        ``output_tools`` is a list of
        ``{"type": "function", "function": {"name", "arguments"}}`` dicts,
        one per tool call. On failure the function does not raise: it returns
        ``("ERROR: <details>", [])`` so callers can detect the ``ERROR:``
        prefix.
    """
    kwargs = {"model": model, "messages": messages, "max_completion_tokens": 4096}
    if tools_schema:
        kwargs["tools"] = tools_schema

    for attempt in range(max_retries):
        try:
            resp = client.chat.completions.create(**kwargs)
            msg = resp.choices[0].message
            output_text = msg.content or ""
            output_tools = []
            if msg.tool_calls:
                output_tools = [
                    {"type": "function", "function": {"name": tc.function.name, "arguments": tc.function.arguments}}
                    for tc in msg.tool_calls
                ]
            return output_text, output_tools
        except Exception as e:
            # Prefer the HTTP status carried by the exception. Fall back to
            # the message text only when no status is available, so an
            # unrelated error whose text merely contains "429" (a request id,
            # a token count, ...) is not retried as a rate limit.
            status = getattr(e, "status_code", None)
            rate_limited = status == 429 if status is not None else "429" in str(e)
            if rate_limited and attempt < max_retries - 1:
                time.sleep(5 * (attempt + 1))
            else:
                return f"ERROR: {e}", []
    # Only reached when max_retries <= 0 (the loop body never ran).
    return "ERROR: max retries", []
85 
86 
def calibrate(client, model, data, grade_fn, tools_schema=None, n=30):
    """Run the base model on data, grade each output, and print a threshold analysis.

    For every (sampled) example the model is called through ``run_model``,
    the output is scored with ``grade_fn(sample, item)``, and the scores are
    summarised as a pass/fail table over a fixed list of candidate
    thresholds, a recommended ``pass_threshold``, and a score histogram.
    Everything is printed to stdout; nothing is returned.

    Examples that could not be scored are reported on their own line and
    excluded from the statistics, so infrastructure or data problems do not
    masquerade as low grader scores. That covers a missing ``messages`` list,
    a model call whose text starts with ``ERROR:`` (the failure marker
    produced by ``run_model``), a grader exception, or a grader result that
    is not a finite-comparable number.

    Args:
        client: Chat-completions client forwarded to ``run_model``.
        model: Model / deployment name.
        data: List of example dicts. Each needs a ``messages`` list; all
            other keys are passed to the grader as ``item``.
        grade_fn: Callable ``grade(sample, item)`` returning a numeric score.
            ``sample`` is ``{"output_text": str, "output_tools": list}``.
        tools_schema: Optional tool definitions forwarded to ``run_model``.
        n: Maximum number of examples to evaluate; larger datasets are
            randomly sampled down to ``n`` (values below 0 are treated as 0).

    Returns:
        None.
    """
    if not data:
        print("No examples to evaluate. Check your data file.")
        return

    # Sample if dataset is larger than n
    if len(data) > n:
        data = random.sample(data, max(n, 0))

    print(f"Running {model} on {len(data)} examples...\n")

    scores = []  # numeric scores of successfully graded examples only
    skipped = 0  # examples dropped because of data, model, or grader errors
    called_model = False
    for i, ex in enumerate(data):
        tag = f"  [{i+1:3d}]"

        messages = ex.get("messages") if isinstance(ex, dict) else None
        if not isinstance(messages, list):
            print(f"{tag} ❌ Example has no 'messages' list; skipping")
            skipped += 1
            continue
        last = messages[-1] if messages else None
        # Display only; content may be missing or not a plain string.
        user_msg = str(last.get("content") or "") if isinstance(last, dict) else ""

        # Pace consecutive requests, including the one that follows a failure.
        if called_model:
            time.sleep(0.5)  # Rate limiting
        called_model = True
        output_text, output_tools = run_model(client, model, messages, tools_schema)

        if output_text.startswith("ERROR:"):
            print(f"{tag} ❌ {output_text[:60]}")
            skipped += 1
            continue

        # Build sample dict matching what the grader expects
        sample = {"output_text": output_text, "output_tools": output_tools}

        # Build item dict from all fields in the training example
        item = {k: v for k, v in ex.items() if k != "messages"}

        try:
            # Coerce inside the try so a None / non-numeric result is reported
            # as a grader error instead of crashing the comparisons below.
            score = float(grade_fn(sample, item))
            if score != score:  # NaN compares unequal to itself
                raise ValueError("grader returned NaN")
        except Exception as e:
            print(f"{tag} ❌ Grader error: {e}")
            skipped += 1
            continue

        status = "✅" if score >= 0.9 else ("⚠️" if score >= 0.5 else "❌")
        print(f"{tag} {score:.3f} {status}  {user_msg[:55]}")
        scores.append(score)

    # Analysis
    if not scores:
        print("\n❌ No examples were scored successfully. Check model access and data format.")
        return
    total = len(scores)
    avg = sum(scores) / total
    print(f"\n{'='*60}")
    print(f"  BASE MODEL GRADER CALIBRATION ({total} examples)")
    print(f"  Average score: {avg:.1%}")
    if skipped:
        print(f"  Skipped (model/grader errors): {skipped}")
    print(f"{'='*60}")

    print(f"\n  {'Threshold':>10} {'Pass Rate':>10} {'Fail Rate':>10} {'Signal':>20}")
    print(f"  {'-'*10} {'-'*10} {'-'*10} {'-'*20}")

    best_threshold = None
    best_fail_rate = None
    best_distance = float("inf")

    for threshold in [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 1.0]:
        # Derive both rates from integer counts: `1 - pass_rate` picks up
        # rounding error (1 - 0.9 < 0.10) and mislabels band boundaries.
        fails = sum(1 for s in scores if s < threshold)
        fail_rate = fails / total
        pass_rate = (total - fails) / total

        if 0.25 <= fail_rate <= 0.50:
            signal = "✅ Good (25-50%)"
            distance = abs(fail_rate - 0.35)  # Ideal is ~35%
            # Strict comparison: on a tie the lowest threshold wins.
            if distance < best_distance:
                best_distance = distance
                best_threshold = threshold
                best_fail_rate = fail_rate
        elif fail_rate < 0.10:
            signal = "❌ Too easy"
        elif fail_rate < 0.25:
            signal = "⚠️ Weak signal"
        elif fail_rate <= 0.70:
            signal = "⚠️ Harsh"
        else:
            signal = "❌ Too hard"

        print(f"  {threshold:>10.2f} {pass_rate:>9.0%} {fail_rate:>9.0%} {signal:>20}")

    if best_threshold is not None:
        print(f"\n  ✅ Recommended pass_threshold: {best_threshold}")
        print(f"     (~{best_fail_rate:.0%} failure rate)")
    else:
        print(f"\n  ⚠️ No threshold in the ideal 25-50% failure range.")
        print(f"     Consider adjusting your grader scoring dimensions.")

    # Score distribution
    print(f"\n  Score distribution:")
    buckets = {"0.0-0.2": 0, "0.2-0.4": 0, "0.4-0.6": 0, "0.6-0.8": 0, "0.8-0.9": 0, "0.9-1.0": 0}
    for s in scores:
        # Out-of-range scores fall into the nearest edge bucket.
        if s < 0.2:
            buckets["0.0-0.2"] += 1
        elif s < 0.4:
            buckets["0.2-0.4"] += 1
        elif s < 0.6:
            buckets["0.4-0.6"] += 1
        elif s < 0.8:
            buckets["0.6-0.8"] += 1
        elif s < 0.9:
            buckets["0.8-0.9"] += 1
        else:
            buckets["0.9-1.0"] += 1
    for bucket, count in buckets.items():
        bar = "█" * count
        print(f"    {bucket}: {count:3d} {bar}")
188 
189 
def build_parser():
    """Create the command-line parser for the calibration script.

    Connection options (``--base-url``, ``--endpoint``, ``--api-key``,
    ``--project-endpoint``) take their defaults from environment variables,
    read at the time this function is called. ``--model``, ``--data`` and
    ``--grader`` are required.

    Returns:
        A configured ``HelpOnErrorParser``.
    """
    env = os.environ.get
    examples = (
        "Example:\n"
        "  python calibrate_grader.py --model o4-mini --data train.jsonl --grader grader.py\n"
        "  python calibrate_grader.py --model o4-mini --data val.jsonl --grader grader.py --n 20"
    )
    cli = HelpOnErrorParser(
        description="Calibrate RFT grader pass_threshold on base model outputs",
        epilog=examples,
        formatter_class=argparse.RawTextHelpFormatter,
    )

    # (flag, add_argument keyword arguments), registered in display order.
    options = [
        ("--base-url", {"default": env("OPENAI_BASE_URL"), "help": "Project /v1/ endpoint URL"}),
        ("--endpoint", {"default": env("AZURE_OPENAI_ENDPOINT"), "help": "Azure OpenAI endpoint (fallback)"}),
        ("--api-key", {"default": env("AZURE_OPENAI_API_KEY"), "help": "API key"}),
        ("--project-endpoint", {"default": env("AZURE_AI_PROJECT_ENDPOINT"), "help": "Azure AI project endpoint"}),
        ("--model", {"required": True, "help": "Base model deployment name to calibrate against"}),
        ("--data", {"required": True, "help": "Path to training or validation JSONL file"}),
        ("--grader", {"required": True, "help": "Path to Python grader file (must define grade(sample, item))"}),
        ("--n", {"type": int, "default": 30, "help": "Number of examples to evaluate (default: 30)"}),
        ("--tools", {"default": None,
                     "help": "Tool schemas as JSON array (for tool-calling models). Pass as a JSON string."}),
        ("--seed", {"type": int, "default": 42, "help": "Random seed for sampling (default: 42)"}),
    ]
    for flag, spec in options:
        cli.add_argument(flag, **spec)
    return cli
214 
215 
# Script entry point: parse arguments, load the dataset and grader, then run
# the calibration. Input problems are reported as a one-line "❌ ..." message
# followed by exit code 1 (the same style load_grader uses) rather than a
# raw traceback.
if __name__ == "__main__":
    parser = build_parser()
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    args = parser.parse_args()

    # Validate cheap, local inputs before creating any client.
    if args.n < 1:
        print(f"❌ --n must be a positive integer (got {args.n})")
        sys.exit(1)

    # Parse tools if provided
    tools_schema = None
    if args.tools:
        try:
            tools_schema = json.loads(args.tools)
        except json.JSONDecodeError as e:
            print(f"❌ --tools is not valid JSON: {e}")
            sys.exit(1)
        if not isinstance(tools_schema, list):
            print("❌ --tools must be a JSON array of tool schemas")
            sys.exit(1)

    random.seed(args.seed)

    client, _method = get_clients(
        base_url=args.base_url,
        azure_endpoint=args.endpoint,
        project_endpoint=args.project_endpoint,
        api_key=args.api_key,
    )

    # Load data
    data = []
    try:
        # utf-8-sig also accepts plain UTF-8 and strips a leading BOM, which
        # would otherwise make the first JSON line unparseable.
        with open(args.data, encoding="utf-8-sig") as f:
            for ln, line in enumerate(f, 1):
                if not line.strip():
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"⚠️ Skipping malformed JSON on line {ln}: {e}")
                    continue
                # calibrate() reads each example as a dict; skip anything else.
                if not isinstance(record, dict):
                    print(f"⚠️ Skipping line {ln}: expected a JSON object, got {type(record).__name__}")
                    continue
                data.append(record)
    except (OSError, UnicodeDecodeError) as e:
        print(f"❌ Cannot read data file {args.data}: {e}")
        sys.exit(1)
    print(f"Loaded {len(data)} examples from {args.data}")

    # Load grader
    grade_fn = load_grader(args.grader)
    print(f"Loaded grader from {args.grader}")

    calibrate(client, args.model, data, grade_fn, tools_schema, args.n)
249
Preparing the source view

Microsoft Foundry Skill

finetuning/scripts/calibrate_grader.py