Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Build and deploy AI applications on Azure AI Foundry using Microsoft's model catalog and AI services
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
finetuning/scripts/evaluate_model.py
# /// script
# dependencies = [
#     "openai>=1.0",
#     "azure-identity",
# ]
# ///
"""
evaluate_model.py — Custom 2-dimension LLM judge evaluator for fine-tuned models.

This is a lightweight evaluation script using the OpenAI API directly.
For production evaluation, prefer the Azure AI Evaluation SDK which provides
built-in graders, batch evaluation, and guardrail metrics. See
references/evaluation.md for SDK patterns.

Uses the OpenAI API directly to:
1. Generate responses from a deployed fine-tuned model
2. Grade each response on correctness and conciseness using an LLM judge
3. Produce aggregate quality scores (weighted 70% correctness, 30% conciseness)

By default, system prompts from each test example's messages array are used
during generation. The --system-prompt flag overrides this for all examples.

Generation failures (outputs starting with "ERROR:") are not sent to the
judge; they are reported as failed and excluded from the aggregate scores.

Usage:
    python evaluate_model.py \
        --deployment-name my-ft-eval \
        --test-file test.jsonl \
        --judge-model gpt-4o \
        --output results.json

    python evaluate_model.py \
        --base-url "$BASE_URL" --api-key "$API_KEY" \
        --deployment-name my-ft-eval \
        --test-file test.jsonl \
        --concurrency 4
"""

import json
import os
import re
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

try:
    sys.stdout.reconfigure(encoding="utf-8")
    sys.stderr.reconfigure(encoding="utf-8")
except (AttributeError, OSError):
    pass  # Stream not reconfigurable (older Python or non-tty); default encoding is fine

# The shared helpers live next to this script; make them importable when the
# script is run from another working directory.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from common import HelpOnErrorParser, get_clients, _clamp_score


JUDGE_PROMPT = """You are evaluating the quality of a model's output for a given task.

## Task prompt
{prompt}

## Reference answer
{reference}

## Model output
{output}

## Scoring

Rate the output on two dimensions, each on a scale of 1-10:

**Correctness** (1-10): Does the output correctly accomplish the task?
- 1-3: Fundamentally wrong or broken
- 4-6: Partially correct with significant issues
- 7-8: Mostly correct with minor issues
- 9-10: Fully correct

**Conciseness** (1-10): Is the output appropriately concise?
- 1-3: Extremely verbose or padded
- 4-6: Contains unnecessary content
- 7-8: Mostly concise with minor excess
- 9-10: Clean and focused

Return ONLY a JSON object: {{"correctness": <int>, "conciseness": <int>}}"""

# Prefix used by generate_response() to mark a failed generation.
ERROR_PREFIX = "ERROR:"

# First brace-delimited span without a nested "}" in the judge's reply.
_JSON_OBJECT_RE = re.compile(r'\{[^}]+\}')


def _first_content(msgs, role):
    """Return the 'content' of the first message with the given role, or None.

    Entries that are not dicts, or that lack a 'role'/'content' key, are
    tolerated instead of raising.
    """
    for m in msgs:
        if isinstance(m, dict) and m.get("role") == role:
            return m.get("content")
    return None


def load_test_data(filepath):
    """Load held-out test set. Expects JSONL with 'messages' array.

    Extracts the system prompt (if present), user prompt, and assistant
    reference from each example so per-example system prompts are preserved.

    Blank lines are ignored. Lines that are malformed JSON, are not JSON
    objects, or lack a usable 'messages' list / user message / assistant
    message are skipped with a warning that names the 1-based line number.

    Returns:
        A list of dicts with keys 'prompt', 'reference' and 'system_prompt'
        ('system_prompt' is None when the example has no system message).
    """
    data = []
    with open(filepath, encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                ex = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"⚠️ Skipping malformed JSON on line {lineno}: {e}")
                continue
            # A line can be valid JSON without being an object (e.g. a list).
            msgs = ex.get("messages") if isinstance(ex, dict) else None
            if not isinstance(msgs, list):
                print(f"⚠️ Skipping example on line {lineno}: missing or invalid 'messages' list")
                continue
            prompt = _first_content(msgs, "user")
            reference = _first_content(msgs, "assistant")
            if not prompt:
                print(f"⚠️ Skipping example on line {lineno}: missing 'user' message")
                continue
            if not reference:
                print(f"⚠️ Skipping example on line {lineno}: missing 'assistant' message")
                continue
            data.append({
                "prompt": prompt,
                "reference": reference,
                "system_prompt": _first_content(msgs, "system"),
            })
    return data


def generate_response(client, deployment, prompt, system_prompt=None, max_retries=3):
    """Generate a single response from the deployed model.

    Args:
        client: Client exposing ``chat.completions.create``.
        deployment: Deployment name passed as ``model``.
        prompt: User message content.
        system_prompt: Optional system message placed before the user message.
        max_retries: Number of attempts before giving up.

    Returns:
        The completion text, or a string starting with "ERROR:" when the
        completion was empty or every attempt raised.
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    for attempt in range(max_retries):
        try:
            resp = client.chat.completions.create(
                model=deployment,
                messages=messages,
                temperature=0.0,
                max_completion_tokens=2048,
            )
            content = resp.choices[0].message.content
            if content is None:
                # Content filter or empty completion — surface as an error
                # sentinel so main() can recognise it by the "ERROR:" prefix
                # and keep it out of grading and the aggregate.
                finish = getattr(resp.choices[0], "finish_reason", "unknown")
                return f"ERROR: empty content (finish_reason={finish})"
            return content
        except Exception as e:
            # Best-effort: any failure is retried with a linearly growing
            # pause, and reported as an error sentinel after the last attempt.
            if attempt >= max_retries - 1:
                return f"ERROR: {e}"
            time.sleep(3 * (attempt + 1))
    # Only reachable when max_retries <= 0.
    return "ERROR: max retries exceeded"


def grade_response(judge_client, judge_model, prompt, reference, output, max_retries=3):
    """Grade a response using the LLM judge.

    Formats JUDGE_PROMPT with the task prompt, reference and model output,
    asks the judge model for a reply, and parses the first JSON-looking
    object found in it.

    Returns:
        A dict with 'correctness' and 'conciseness' (each passed through
        ``_clamp_score``). On failure both scores are 0 and an 'error' key
        holds the last exception text, or "All retries failed" when no
        attempt produced a parsable object.
    """
    judge_input = JUDGE_PROMPT.format(prompt=prompt, reference=reference, output=output)

    for attempt in range(max_retries):
        try:
            resp = judge_client.chat.completions.create(
                model=judge_model,
                messages=[{"role": "user", "content": judge_input}],
                temperature=0.0,
                max_completion_tokens=200,
            )
            text = (resp.choices[0].message.content or "").strip()
            # Extract JSON from response
            match = _JSON_OBJECT_RE.search(text)
            if match:
                scores = json.loads(match.group())
                return {
                    "correctness": _clamp_score(scores.get("correctness")),
                    "conciseness": _clamp_score(scores.get("conciseness")),
                }
            # No JSON object in the reply: fall through and try again.
        except Exception as e:
            # Covers API errors as well as JSON parsing failures.
            if attempt < max_retries - 1:
                time.sleep(2)
            else:
                return {"correctness": 0, "conciseness": 0, "error": str(e)}

    return {"correctness": 0, "conciseness": 0, "error": "All retries failed"}


def _build_parser():
    """Build the command-line parser for the evaluator."""
    parser = HelpOnErrorParser(description="Evaluate a fine-tuned model with LLM judge")
    parser.add_argument("--base-url", default=os.environ.get("OPENAI_BASE_URL"),
                        help="Project /v1/ URL (preferred)")
    parser.add_argument("--endpoint", default=os.environ.get("AZURE_OPENAI_ENDPOINT"),
                        help="Azure OpenAI endpoint (fallback)")
    parser.add_argument("--project-endpoint", default=os.environ.get("AZURE_AI_PROJECT_ENDPOINT"),
                        help="Azure AI project endpoint (Foundry SDK)")
    parser.add_argument("--api-key", default=os.environ.get("AZURE_OPENAI_API_KEY"))
    parser.add_argument("--deployment-name", required=True, help="Deployed model name")
    parser.add_argument("--test-file", required=True, help="Held-out test set (JSONL)")
    parser.add_argument("--system-prompt", default=None,
                        help="Override system prompt for all examples (default: use per-example system prompt from test data)")

    # Judge config
    parser.add_argument("--judge-model", default="gpt-4o", help="Model for LLM judge")
    parser.add_argument("--judge-endpoint", help="Endpoint for judge (default: same as model)")
    parser.add_argument("--judge-api-key", help="API key for judge (default: same as model)")

    # Output
    parser.add_argument("--output", default="eval_results.json", help="Output file")
    parser.add_argument("--concurrency", type=int, default=1,
                        help="Parallel grading workers (generation is always sequential)")
    return parser


def main():
    """Run generation, judging and aggregation, then write the results file.

    Exits with status 1 when no usable test examples were loaded or when no
    example received a valid (non-zero correctness) score.
    """
    args = _build_parser().parse_args()

    # Set up model client via shared auth (supports /v1/, Foundry SDK, AzureOpenAI)
    model_client, _ = get_clients(
        base_url=args.base_url, azure_endpoint=args.endpoint,
        project_endpoint=args.project_endpoint, api_key=args.api_key
    )

    # Set up judge client (defaults to same connection as model)
    judge_key = args.judge_api_key or args.api_key
    if args.judge_endpoint:
        judge_client, _ = get_clients(azure_endpoint=args.judge_endpoint, api_key=judge_key)
    elif args.judge_api_key:
        # Different API key but same endpoint — create a new client with the judge key
        judge_client, _ = get_clients(
            base_url=args.base_url, azure_endpoint=args.endpoint,
            project_endpoint=args.project_endpoint, api_key=judge_key
        )
    else:
        judge_client = model_client

    # Load data
    test_data = load_test_data(args.test_file)
    print(f"Loaded {len(test_data)} test examples from {args.test_file}")
    if not test_data:
        # Nothing to generate or grade; say so instead of blaming the judge.
        print(f"No valid test examples found in {args.test_file}.")
        sys.exit(1)

    # Phase 1: Generate responses (sequential to avoid rate limits)
    print(f"\nGenerating responses from {args.deployment_name}...")
    for i, ex in enumerate(test_data):
        # Use CLI override if provided, otherwise use per-example system prompt
        effective_system_prompt = (
            args.system_prompt if args.system_prompt is not None else ex.get("system_prompt")
        )
        ex["output"] = generate_response(
            model_client, args.deployment_name, ex["prompt"], effective_system_prompt
        )
        if (i + 1) % 10 == 0:
            print(f" Generated {i+1}/{len(test_data)}")

    errors = sum(1 for ex in test_data if ex["output"].startswith(ERROR_PREFIX))
    print(f" Done. {errors} errors out of {len(test_data)}.")

    # Phase 2: Grade responses (parallel)
    # ThreadPoolExecutor raises ValueError for max_workers <= 0.
    workers = max(1, args.concurrency)
    print(f"\nGrading with {args.judge_model} (concurrency={workers})...")

    def grade_one(ex):
        # A failed generation is not a model answer: mark it as failed without
        # spending a judge call, so it cannot skew the averages.
        if ex["output"].startswith(ERROR_PREFIX):
            return {"correctness": 0, "conciseness": 0, "error": ex["output"]}
        return grade_response(judge_client, args.judge_model,
                              ex["prompt"], ex["reference"], ex["output"])

    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {pool.submit(grade_one, ex): i for i, ex in enumerate(test_data)}
        for future in as_completed(futures):
            idx = futures[future]
            test_data[idx]["scores"] = future.result()

    # Aggregate: a correctness of 0 marks a failed generation or failed grading.
    valid_scores = [ex["scores"] for ex in test_data
                    if ex["scores"]["correctness"] > 0]
    if not valid_scores:
        print("No valid scores — all grading failed.")
        sys.exit(1)

    avg_corr = sum(s["correctness"] for s in valid_scores) / len(valid_scores)
    avg_conc = sum(s["conciseness"] for s in valid_scores) / len(valid_scores)
    combined = 0.7 * avg_corr + 0.3 * avg_conc

    print(f"\n{'='*50}")
    print(f"Results for {args.deployment_name}")
    print(f" Correctness: {avg_corr:.2f}")
    print(f" Conciseness: {avg_conc:.2f}")
    print(f" Combined: {combined:.2f}")
    print(f" (N={len(valid_scores)} scored, {len(test_data)-len(valid_scores)} failed)")
    print(f"{'='*50}")

    # Save
    results = {
        "deployment": args.deployment_name,
        "judge_model": args.judge_model,
        "n_examples": len(test_data),
        "n_scored": len(valid_scores),
        "correctness": round(avg_corr, 2),
        "conciseness": round(avg_conc, 2),
        "combined": round(combined, 2),
        "details": [
            {
                "prompt": ex["prompt"][:200],
                "scores": ex.get("scores", {}),
            }
            for ex in test_data
        ],
    }

    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"\nDetailed results saved to {args.output}")


if __name__ == "__main__":
    main()