Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Deploy, evaluate, and manage AI agents end-to-end on Microsoft Azure AI Foundry
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
finetuning/scripts/evaluate_model.py
# /// script
# dependencies = [
#   "openai>=1.0",
#   "azure-identity",
# ]
# ///
"""
evaluate_model.py — Custom 2-dimension LLM judge evaluator for fine-tuned models.

This is a lightweight evaluation script using the OpenAI API directly.
For production evaluation, prefer the Azure AI Evaluation SDK which provides
built-in graders, batch evaluation, and guardrail metrics. See
references/evaluation.md for SDK patterns.

Uses the OpenAI API directly to:
1. Generate responses from a deployed fine-tuned model
2. Grade each response on correctness and conciseness using an LLM judge
3. Produce aggregate quality scores (weighted 70% correctness, 30% conciseness)

By default, system prompts from each test example's messages array are used
during generation. The --system-prompt flag overrides this for all examples.

Usage:
    python evaluate_model.py \
        --deployment-name my-ft-eval \
        --test-file test.jsonl \
        --judge-model gpt-4o \
        --output results.json

    python evaluate_model.py \
        --base-url "$BASE_URL" --api-key "$API_KEY" \
        --deployment-name my-ft-eval \
        --test-file test.jsonl \
        --concurrency 4
"""

import json
import os
import re
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

try:
    sys.stdout.reconfigure(encoding="utf-8")
    sys.stderr.reconfigure(encoding="utf-8")
except (AttributeError, OSError):
    pass  # Stream not reconfigurable (older Python or non-tty); default encoding is fine

# `common` sits next to this script; put the script directory on sys.path so the
# import works regardless of the current working directory.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from common import HelpOnErrorParser, get_clients, _clamp_score  # noqa: E402


# Prefix of the sentinel string generate_response() returns instead of raising.
# main() uses it to count failed generations and to keep them away from the judge.
ERROR_PREFIX = "ERROR:"

JUDGE_PROMPT = """You are evaluating the quality of a model's output for a given task.

## Task prompt
{prompt}

## Reference answer
{reference}

## Model output
{output}

## Scoring

Rate the output on two dimensions, each on a scale of 1-10:

**Correctness** (1-10): Does the output correctly accomplish the task?
- 1-3: Fundamentally wrong or broken
- 4-6: Partially correct with significant issues
- 7-8: Mostly correct with minor issues
- 9-10: Fully correct

**Conciseness** (1-10): Is the output appropriately concise?
- 1-3: Extremely verbose or padded
- 4-6: Contains unnecessary content
- 7-8: Mostly concise with minor excess
- 9-10: Clean and focused

Return ONLY a JSON object: {{"correctness": <int>, "conciseness": <int>}}"""


def _first_content(msgs, role):
    """Return the 'content' of the first message in *msgs* with the given role.

    Entries that are not dicts, or that lack a 'role'/'content' key, are
    tolerated rather than raising. Returns None when no message matches.
    """
    for msg in msgs:
        if isinstance(msg, dict) and msg.get("role") == role:
            return msg.get("content")
    return None


def load_test_data(filepath):
    """Load held-out test set. Expects JSONL with a 'messages' array per line.

    Extracts the system prompt (if present), the first user prompt, and the
    first assistant message (used as the reference answer) from each example,
    so per-example system prompts are preserved.

    Blank lines are ignored. Lines that are not valid JSON, are not JSON
    objects, or lack a usable user/assistant message are skipped with a
    warning that names the 1-based line number in the file.

    Args:
        filepath: Path to the JSONL test file (read as UTF-8).

    Returns:
        A list of dicts with keys 'prompt', 'reference' and 'system_prompt'
        ('system_prompt' is None when the example has no system message).
    """
    data = []
    with open(filepath, encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                ex = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"⚠️ Skipping malformed JSON on line {lineno}: {e}")
                continue
            # A line can be valid JSON without being an object (e.g. a list).
            msgs = ex.get("messages") if isinstance(ex, dict) else None
            if not isinstance(msgs, list):
                print(f"⚠️ Skipping line {lineno}: missing or invalid 'messages' list")
                continue
            prompt = _first_content(msgs, "user")
            reference = _first_content(msgs, "assistant")
            if not prompt:
                print(f"⚠️ Skipping line {lineno}: missing 'user' message")
                continue
            if not reference:
                print(f"⚠️ Skipping line {lineno}: missing 'assistant' message")
                continue
            data.append({
                "prompt": prompt,
                "reference": reference,
                "system_prompt": _first_content(msgs, "system"),
            })
    return data


def generate_response(client, deployment, prompt, system_prompt=None, max_retries=3):
    """Generate a single response from the deployed model.

    Never raises for API failures: after the retries are exhausted (or when the
    completion has no content) a string starting with ERROR_PREFIX is returned
    so the caller can detect the failure.

    Args:
        client: Client object exposing `chat.completions.create`.
        deployment: Deployment/model name passed as `model`.
        prompt: User message content.
        system_prompt: Optional system message; omitted when falsy.
        max_retries: Maximum number of attempts.

    Returns:
        The completion text, or an "ERROR: ..." sentinel string on failure.
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    for attempt in range(max_retries):
        try:
            resp = client.chat.completions.create(
                model=deployment,
                messages=messages,
                temperature=0.0,
                max_completion_tokens=2048,
            )
            content = resp.choices[0].message.content
            if content is None:
                # Content filter or empty completion — surface as an error
                # sentinel so main() counts it as a failed generation and does
                # not send it to the judge.
                finish = getattr(resp.choices[0], "finish_reason", "unknown")
                return f"{ERROR_PREFIX} empty content (finish_reason={finish})"
            return content
        except Exception as e:
            if attempt >= max_retries - 1:
                return f"{ERROR_PREFIX} {e}"
            # Linear back-off: 3s, 6s, ...
            time.sleep(3 * (attempt + 1))
    # Only reachable when max_retries <= 0.
    return f"{ERROR_PREFIX} max retries exceeded"


def grade_response(judge_client, judge_model, prompt, reference, output, max_retries=3):
    """Grade a response using the LLM judge.

    Formats JUDGE_PROMPT with the task prompt, reference answer and model
    output, asks the judge for a JSON object, and extracts the two scores.

    Args:
        judge_client: Client object exposing `chat.completions.create`.
        judge_model: Judge model/deployment name.
        prompt: The task prompt given to the evaluated model.
        reference: The reference answer from the test set.
        output: The evaluated model's output.
        max_retries: Maximum number of judge attempts.

    Returns:
        A dict with 'correctness' and 'conciseness' (each passed through
        `_clamp_score`). On failure both scores are 0 and an 'error' key
        describes the problem.
    """
    judge_input = JUDGE_PROMPT.format(prompt=prompt, reference=reference, output=output)

    for attempt in range(max_retries):
        try:
            resp = judge_client.chat.completions.create(
                model=judge_model,
                messages=[{"role": "user", "content": judge_input}],
                temperature=0.0,
                max_completion_tokens=200,
            )
            text = (resp.choices[0].message.content or "").strip()
            # The judge may wrap the JSON in prose or code fences; pull out the
            # first flat {...} object.
            match = re.search(r'\{[^}]+\}', text)
            if match:
                scores = json.loads(match.group())
                return {
                    "correctness": _clamp_score(scores.get("correctness")),
                    "conciseness": _clamp_score(scores.get("conciseness")),
                }
            # No JSON object in the reply: fall through and retry immediately.
        except Exception as e:
            if attempt < max_retries - 1:
                time.sleep(2)
            else:
                return {"correctness": 0, "conciseness": 0, "error": str(e)}

    return {"correctness": 0, "conciseness": 0, "error": "All retries failed"}


def main():
    """CLI entry point: generate responses, grade them, print and save scores.

    Steps:
      1. Parse arguments and build the model and judge clients.
      2. Generate one response per test example (sequentially).
      3. Grade the responses with the judge (optionally in parallel).
         Examples whose generation failed are not sent to the judge; they get
         a zero score with an 'error' entry and are excluded from the averages.
      4. Print the aggregate scores and write them, with per-example details,
         to the --output JSON file.

    Exits with status 1 when there are no usable test examples or when no
    example received a valid score.
    """
    parser = HelpOnErrorParser(description="Evaluate a fine-tuned model with LLM judge")
    parser.add_argument("--base-url", default=os.environ.get("OPENAI_BASE_URL"),
                        help="Project /v1/ URL (preferred)")
    parser.add_argument("--endpoint", default=os.environ.get("AZURE_OPENAI_ENDPOINT"),
                        help="Azure OpenAI endpoint (fallback)")
    parser.add_argument("--project-endpoint", default=os.environ.get("AZURE_AI_PROJECT_ENDPOINT"),
                        help="Azure AI project endpoint (Foundry SDK)")
    parser.add_argument("--api-key", default=os.environ.get("AZURE_OPENAI_API_KEY"))
    parser.add_argument("--deployment-name", required=True, help="Deployed model name")
    parser.add_argument("--test-file", required=True, help="Held-out test set (JSONL)")
    parser.add_argument("--system-prompt", default=None,
                        help="Override system prompt for all examples (default: use per-example system prompt from test data)")

    # Judge config
    parser.add_argument("--judge-model", default="gpt-4o", help="Model for LLM judge")
    parser.add_argument("--judge-endpoint", help="Endpoint for judge (default: same as model)")
    parser.add_argument("--judge-api-key", help="API key for judge (default: same as model)")

    # Output
    parser.add_argument("--output", default="eval_results.json", help="Output file")
    parser.add_argument("--concurrency", type=int, default=1,
                        help="Parallel grading workers (generation is always sequential)")

    args = parser.parse_args()

    # Set up model client via shared auth (supports /v1/, Foundry SDK, AzureOpenAI)
    model_client, method = get_clients(
        base_url=args.base_url, azure_endpoint=args.endpoint,
        project_endpoint=args.project_endpoint, api_key=args.api_key
    )

    # Set up judge client (defaults to same connection as model)
    judge_key = args.judge_api_key or args.api_key
    if args.judge_endpoint:
        judge_client, _ = get_clients(azure_endpoint=args.judge_endpoint, api_key=judge_key)
    elif args.judge_api_key:
        # Different API key but same endpoint — create a new client with the judge key
        judge_client, _ = get_clients(
            base_url=args.base_url, azure_endpoint=args.endpoint,
            project_endpoint=args.project_endpoint, api_key=judge_key
        )
    else:
        judge_client = model_client

    # Load data
    test_data = load_test_data(args.test_file)
    print(f"Loaded {len(test_data)} test examples from {args.test_file}")
    if not test_data:
        # Bail out early so the user is not told that "grading failed" when
        # there was nothing to grade in the first place.
        print("No usable test examples found — nothing to evaluate.")
        sys.exit(1)

    # Phase 1: Generate responses (sequential to avoid rate limits)
    print(f"\nGenerating responses from {args.deployment_name}...")
    for i, ex in enumerate(test_data):
        # Use CLI override if provided, otherwise use per-example system prompt
        effective_system_prompt = args.system_prompt if args.system_prompt is not None else ex.get("system_prompt")
        ex["output"] = generate_response(
            model_client, args.deployment_name, ex["prompt"], effective_system_prompt
        )
        if (i + 1) % 10 == 0:
            print(f" Generated {i+1}/{len(test_data)}")

    errors = sum(1 for ex in test_data if ex["output"].startswith(ERROR_PREFIX))
    print(f" Done. {errors} errors out of {len(test_data)}.")

    # Phase 2: Grade responses (parallel)
    print(f"\nGrading with {args.judge_model} (concurrency={args.concurrency})...")

    def grade_one(ex):
        # A failed generation has no model output to judge. Scoring the error
        # text would waste a judge call and drag the averages down, so mark it
        # as failed (correctness 0 is filtered out during aggregation).
        if ex["output"].startswith(ERROR_PREFIX):
            return {"correctness": 0, "conciseness": 0,
                    "error": f"generation failed: {ex['output']}"}
        return grade_response(judge_client, args.judge_model,
                              ex["prompt"], ex["reference"], ex["output"])

    # ThreadPoolExecutor raises ValueError for max_workers <= 0.
    workers = max(1, args.concurrency)
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {pool.submit(grade_one, ex): i for i, ex in enumerate(test_data)}
        for future in as_completed(futures):
            idx = futures[future]
            test_data[idx]["scores"] = future.result()

    # Aggregate: a correctness of 0 marks a failed generation or failed grading.
    valid_scores = [ex["scores"] for ex in test_data
                    if ex["scores"]["correctness"] > 0]
    if not valid_scores:
        print("No valid scores — all grading failed.")
        sys.exit(1)

    avg_corr = sum(s["correctness"] for s in valid_scores) / len(valid_scores)
    avg_conc = sum(s["conciseness"] for s in valid_scores) / len(valid_scores)
    combined = 0.7 * avg_corr + 0.3 * avg_conc

    print(f"\n{'='*50}")
    print(f"Results for {args.deployment_name}")
    print(f" Correctness: {avg_corr:.2f}")
    print(f" Conciseness: {avg_conc:.2f}")
    print(f" Combined: {combined:.2f}")
    print(f" (N={len(valid_scores)} scored, {len(test_data)-len(valid_scores)} failed)")
    print(f"{'='*50}")

    # Save
    results = {
        "deployment": args.deployment_name,
        "judge_model": args.judge_model,
        "n_examples": len(test_data),
        "n_scored": len(valid_scores),
        "correctness": round(avg_corr, 2),
        "conciseness": round(avg_conc, 2),
        "combined": round(combined, 2),
        "details": [
            {
                "prompt": ex["prompt"][:200],
                "scores": ex.get("scores", {}),
            }
            for ex in test_data
        ],
    }

    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"\nDetailed results saved to {args.output}")


if __name__ == "__main__":
    main()