Source from repo
Microsoft Foundry Skill

Deploy, evaluate, and manage AI agents end-to-end on Microsoft Azure AI Foundry
microsoft · GitHub · microsoft · Official · Source repo · Original GitHub link · Publisher page
Files
144
Skill
n/a
Size
893.8 KB
Entrypoint
SKILL.md
Format
git-repo
Open file
finetuning/scripts/evaluate_model.py

Syntax-highlighted preview of this file as included in the skill package.
Rendered Source
code · 296 lines · Free
finetuning/scripts/evaluate_model.py
1# /// script
2# dependencies = [
3#   "openai>=1.0",
4#   "azure-identity",
5# ]
6# ///
7"""
8evaluate_model.py — Custom 2-dimension LLM judge evaluator for fine-tuned models.
9 
10This is a lightweight evaluation script using the OpenAI API directly.
11For production evaluation, prefer the Azure AI Evaluation SDK which provides
12built-in graders, batch evaluation, and guardrail metrics. See
13references/evaluation.md for SDK patterns.
14 
15Uses the OpenAI API directly to:
161. Generate responses from a deployed fine-tuned model
172. Grade each response on correctness and conciseness using an LLM judge
183. Produce aggregate quality scores (weighted 70% correctness, 30% conciseness)
19 
20By default, system prompts from each test example's messages array are used
21during generation. The --system-prompt flag overrides this for all examples.
22 
23Usage:
24  python evaluate_model.py \
25      --deployment-name my-ft-eval \
26      --test-file test.jsonl \
27      --judge-model gpt-4o \
28      --output results.json
29 
30  python evaluate_model.py \
31      --base-url "$BASE_URL" --api-key "$API_KEY" \
32      --deployment-name my-ft-eval \
33      --test-file test.jsonl \
34      --concurrency 4
35"""
36 
37import json
38import os
39import re
40import sys
41 
42try:
43    sys.stdout.reconfigure(encoding="utf-8")
44    sys.stderr.reconfigure(encoding="utf-8")
45except (AttributeError, OSError):
46    pass  # Stream not reconfigurable (older Python or non-tty); default encoding is fine
47import time
48from concurrent.futures import ThreadPoolExecutor, as_completed
49sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
50from common import HelpOnErrorParser, get_clients, _clamp_score
51 
52 
# Prompt template sent to the LLM judge. grade_response() fills the {prompt},
# {reference} and {output} placeholders via str.format(); the doubled braces on
# the final line are format escapes that render as single literal braces.
53JUDGE_PROMPT = """You are evaluating the quality of a model's output for a given task.
54 
55## Task prompt
56{prompt}
57 
58## Reference answer
59{reference}
60 
61## Model output
62{output}
63 
64## Scoring
65 
66Rate the output on two dimensions, each on a scale of 1-10:
67 
68**Correctness** (1-10): Does the output correctly accomplish the task?
69- 1-3: Fundamentally wrong or broken
70- 4-6: Partially correct with significant issues
71- 7-8: Mostly correct with minor issues
72- 9-10: Fully correct
73 
74**Conciseness** (1-10): Is the output appropriately concise?
75- 1-3: Extremely verbose or padded
76- 4-6: Contains unnecessary content
77- 7-8: Mostly concise with minor excess
78- 9-10: Clean and focused
79 
80Return ONLY a JSON object: {{"correctness": <int>, "conciseness": <int>}}"""
81 
82 
def _first_message_content(msgs, role):
    """Return the 'content' of the first dict message in *msgs* with the given role.

    Entries that are not dicts, or that lack a 'role' key, are ignored instead
    of raising. Returns None when no message matches or the match has no
    'content' key.
    """
    for msg in msgs:
        if isinstance(msg, dict) and msg.get("role") == role:
            return msg.get("content")
    return None


def load_test_data(filepath):
    """Load held-out test set. Expects JSONL with 'messages' array.

    Extracts the system prompt (if present), user prompt, and assistant
    reference from each example so per-example system prompts are preserved.

    Blank lines are ignored. A line is skipped with a printed warning (never
    an exception) when it is not valid JSON, is not a JSON object, has no
    'messages' list, or has no non-empty user / assistant message. Individual
    message entries that are not objects or lack 'role'/'content' are ignored.

    Args:
        filepath: Path to the JSONL test file (read as UTF-8).

    Returns:
        A list of dicts with keys 'prompt', 'reference' and 'system_prompt'
        ('system_prompt' is None when the example has no system message).
    """
    data = []
    with open(filepath, encoding="utf-8") as f:
        for i, line in enumerate(f):
            if not line.strip():
                continue
            try:
                ex = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"⚠️ Skipping malformed JSON on line {i+1}: {e}")
                continue
            # A line can be valid JSON without being an object (e.g. a list or
            # a number); treat that the same as a missing 'messages' list.
            msgs = ex.get("messages") if isinstance(ex, dict) else None
            if not isinstance(msgs, list):
                print(f"⚠️ Skipping example {i}: missing or invalid 'messages' list")
                continue
            prompt = _first_message_content(msgs, "user")
            reference = _first_message_content(msgs, "assistant")
            if not prompt:
                print(f"⚠️ Skipping example {i}: missing 'user' message")
                continue
            if not reference:
                print(f"⚠️ Skipping example {i}: missing 'assistant' message")
                continue
            system_prompt = _first_message_content(msgs, "system")
            data.append({"prompt": prompt, "reference": reference, "system_prompt": system_prompt})
    return data
115 
116 
def generate_response(client, deployment, prompt, system_prompt=None, max_retries=3):
    """Ask the deployed model for one completion and return its text.

    Sends an optional system message followed by the user prompt. Failures
    are never raised to the caller: they are reported as a string starting
    with "ERROR:".

    Args:
        client: Object exposing ``chat.completions.create(...)``.
        deployment: Value passed as ``model`` to the completion call.
        prompt: User message content.
        system_prompt: Optional system message content; omitted when falsy.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The completion text, or an "ERROR: ..." string on failure.
    """
    system_part = [{"role": "system", "content": system_prompt}] if system_prompt else []
    chat = system_part + [{"role": "user", "content": prompt}]

    attempt = 0
    while attempt < max_retries:
        try:
            completion = client.chat.completions.create(
                model=deployment,
                messages=chat,
                temperature=0.0,
                max_completion_tokens=2048,
            )
            choice = completion.choices[0]
            text = choice.message.content
            if text is not None:
                return text
            # No content came back: report it with the same "ERROR:" prefix
            # used for exceptions, including the finish reason when available.
            reason = getattr(choice, "finish_reason", "unknown")
            return f"ERROR: empty content (finish_reason={reason})"
        except Exception as exc:
            out_of_attempts = attempt >= max_retries - 1
            if out_of_attempts:
                return f"ERROR: {exc}"
            # Linear backoff: 3s, 6s, ... before the next attempt.
            time.sleep(3 * (attempt + 1))
        attempt += 1
    return "ERROR: max retries exceeded"
144 
145 
def grade_response(judge_client, judge_model, prompt, reference, output, max_retries=3):
    """Score one model output with the LLM judge.

    Fills JUDGE_PROMPT with the task prompt, reference answer and model
    output, sends it to the judge, and parses the first flat ``{...}`` span
    of the reply as JSON.

    Args:
        judge_client: Object exposing ``chat.completions.create(...)``.
        judge_model: Value passed as ``model`` to the judge call.
        prompt: The task prompt shown to the evaluated model.
        reference: The reference answer for the task.
        output: The evaluated model's output.
        max_retries: Maximum number of judge attempts.

    Returns:
        A dict with 'correctness' and 'conciseness' (each passed through
        _clamp_score). On failure both are 0 and an 'error' key holds the
        reason.
    """
    rendered = JUDGE_PROMPT.format(prompt=prompt, reference=reference, output=output)
    final_attempt = max_retries - 1

    for attempt_no in range(max_retries):
        try:
            reply = judge_client.chat.completions.create(
                model=judge_model,
                messages=[{"role": "user", "content": rendered}],
                temperature=0.0,
                max_completion_tokens=200,
            )
            raw = reply.choices[0].message.content
            cleaned = raw.strip() if raw else ""
            # Pull the first brace-delimited span out of the reply.
            found = re.search(r'\{[^}]+\}', cleaned)
            if found is None:
                # No JSON in the reply: go straight to the next attempt.
                continue
            parsed = json.loads(found.group())
            return {
                "correctness": _clamp_score(parsed.get("correctness")),
                "conciseness": _clamp_score(parsed.get("conciseness")),
            }
        except Exception as exc:
            if attempt_no >= final_attempt:
                return {"correctness": 0, "conciseness": 0, "error": str(exc)}
            time.sleep(2)

    return {"correctness": 0, "conciseness": 0, "error": "All retries failed"}
174 
175 
def main():
    """Command-line entry point: generate, grade, aggregate, and save results.

    Steps:
      1. Parse CLI arguments (several default to environment variables).
      2. Build the model client via get_clients(); build a separate judge
         client only when --judge-endpoint or --judge-api-key is given.
      3. Generate one response per test example, sequentially.
      4. Grade responses with the judge in a thread pool. Examples whose
         generation failed (output starts with "ERROR:") are not sent to the
         judge; they are recorded with zero scores and an 'error' reason so
         they count as failed instead of skewing the averages.
      5. Average correctness and conciseness over examples with a
         correctness score above 0, combine them 70/30, print a summary and
         write the JSON report to --output.

    Exits with status 1 when no test examples could be loaded or when no
    example received a valid score.
    """
    parser = HelpOnErrorParser(description="Evaluate a fine-tuned model with LLM judge")
    parser.add_argument("--base-url", default=os.environ.get("OPENAI_BASE_URL"),
                        help="Project /v1/ URL (preferred)")
    parser.add_argument("--endpoint", default=os.environ.get("AZURE_OPENAI_ENDPOINT"),
                        help="Azure OpenAI endpoint (fallback)")
    parser.add_argument("--project-endpoint", default=os.environ.get("AZURE_AI_PROJECT_ENDPOINT"),
                        help="Azure AI project endpoint (Foundry SDK)")
    parser.add_argument("--api-key", default=os.environ.get("AZURE_OPENAI_API_KEY"))
    parser.add_argument("--deployment-name", required=True, help="Deployed model name")
    parser.add_argument("--test-file", required=True, help="Held-out test set (JSONL)")
    parser.add_argument("--system-prompt", default=None,
                        help="Override system prompt for all examples (default: use per-example system prompt from test data)")

    # Judge config
    parser.add_argument("--judge-model", default="gpt-4o", help="Model for LLM judge")
    parser.add_argument("--judge-endpoint", help="Endpoint for judge (default: same as model)")
    parser.add_argument("--judge-api-key", help="API key for judge (default: same as model)")

    # Output
    parser.add_argument("--output", default="eval_results.json", help="Output file")
    parser.add_argument("--concurrency", type=int, default=1,
                        help="Parallel grading workers (generation is always sequential)")

    args = parser.parse_args()

    # ThreadPoolExecutor raises ValueError for max_workers <= 0, so fall back
    # to a single worker instead of crashing after generation has finished.
    workers = args.concurrency
    if workers < 1:
        print(f"⚠️ --concurrency must be >= 1 (got {workers}); using 1")
        workers = 1

    # Set up model client via shared auth (supports /v1/, Foundry SDK, AzureOpenAI)
    model_client, method = get_clients(
        base_url=args.base_url, azure_endpoint=args.endpoint,
        project_endpoint=args.project_endpoint, api_key=args.api_key
    )

    # Set up judge client (defaults to same connection as model)
    judge_key = args.judge_api_key or args.api_key
    if args.judge_endpoint:
        judge_client, _ = get_clients(azure_endpoint=args.judge_endpoint, api_key=judge_key)
    elif args.judge_api_key:
        # Different API key but same endpoint — create a new client with the judge key
        judge_client, _ = get_clients(
            base_url=args.base_url, azure_endpoint=args.endpoint,
            project_endpoint=args.project_endpoint, api_key=judge_key
        )
    else:
        judge_client = model_client

    # Load data
    test_data = load_test_data(args.test_file)
    print(f"Loaded {len(test_data)} test examples from {args.test_file}")
    if not test_data:
        # Nothing to generate or grade; say so instead of blaming the grader.
        print("No valid test examples loaded — nothing to evaluate.")
        sys.exit(1)

    # Phase 1: Generate responses (sequential to avoid rate limits)
    print(f"\nGenerating responses from {args.deployment_name}...")
    for i, ex in enumerate(test_data):
        # Use CLI override if provided, otherwise use per-example system prompt
        effective_system_prompt = args.system_prompt if args.system_prompt is not None else ex.get("system_prompt")
        ex["output"] = generate_response(
            model_client, args.deployment_name, ex["prompt"], effective_system_prompt
        )
        if (i + 1) % 10 == 0:
            print(f"  Generated {i+1}/{len(test_data)}")

    errors = sum(1 for ex in test_data if ex["output"].startswith("ERROR:"))
    print(f"  Done. {errors} errors out of {len(test_data)}.")

    # Phase 2: Grade responses (parallel)
    print(f"\nGrading with {args.judge_model} (concurrency={workers})...")

    def grade_one(ex):
        # A generation error is not a model answer: do not spend a judge call
        # on it, and give it zero scores so the aggregate below excludes it.
        if ex["output"].startswith("ERROR:"):
            return {"correctness": 0, "conciseness": 0,
                    "error": f"generation failed: {ex['output']}"}
        return grade_response(judge_client, args.judge_model,
                              ex["prompt"], ex["reference"], ex["output"])

    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {pool.submit(grade_one, ex): i for i, ex in enumerate(test_data)}
        for future in as_completed(futures):
            idx = futures[future]
            test_data[idx]["scores"] = future.result()

    # Aggregate: only examples with a correctness score above 0 count as scored.
    valid_scores = [ex["scores"] for ex in test_data
                    if ex["scores"]["correctness"] > 0]
    if not valid_scores:
        print("No valid scores — all grading failed.")
        sys.exit(1)

    avg_corr = sum(s["correctness"] for s in valid_scores) / len(valid_scores)
    avg_conc = sum(s["conciseness"] for s in valid_scores) / len(valid_scores)
    combined = 0.7 * avg_corr + 0.3 * avg_conc

    print(f"\n{'='*50}")
    print(f"Results for {args.deployment_name}")
    print(f"  Correctness:  {avg_corr:.2f}")
    print(f"  Conciseness:  {avg_conc:.2f}")
    print(f"  Combined:     {combined:.2f}")
    print(f"  (N={len(valid_scores)} scored, {len(test_data)-len(valid_scores)} failed)")
    print(f"{'='*50}")

    # Save
    results = {
        "deployment": args.deployment_name,
        "judge_model": args.judge_model,
        "n_examples": len(test_data),
        "n_scored": len(valid_scores),
        "correctness": round(avg_corr, 2),
        "conciseness": round(avg_conc, 2),
        "combined": round(combined, 2),
        "details": [
            {
                "prompt": ex["prompt"][:200],
                "scores": ex.get("scores", {}),
            }
            for ex in test_data
        ],
    }

    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"\nDetailed results saved to {args.output}")


if __name__ == "__main__":
    main()
296
Preparing the source view

Microsoft Foundry Skill

finetuning/scripts/evaluate_model.py