Source from repo

Microsoft Foundry Skill

Build and deploy AI applications on Azure AI Foundry using Microsoft's model catalog and AI services

microsoftGitHub microsoftOfficialSource repo Original GitHub link Publisher page

Files

145

Skill

n/a

Size

893.9 KB

Entrypoint

SKILL.md

Format

git-repo

Open file

finetuning/scripts/evaluate_model.py

Syntax-highlighted preview of this file as included in the skill package.

Rendered Source

code296 linesFree

finetuning/scripts/evaluate_model.py

1# /// script
2# dependencies = [
3#   "openai>=1.0",
4#   "azure-identity",
5# ]
6# ///
7"""
8evaluate_model.py — Custom 2-dimension LLM judge evaluator for fine-tuned models.
9 
10This is a lightweight evaluation script using the OpenAI API directly.
11For production evaluation, prefer the Azure AI Evaluation SDK which provides
12built-in graders, batch evaluation, and guardrail metrics. See
13references/evaluation.md for SDK patterns.
14 
15Uses the OpenAI API directly to:
161. Generate responses from a deployed fine-tuned model
172. Grade each response on correctness and conciseness using an LLM judge
183. Produce aggregate quality scores (weighted 70% correctness, 30% conciseness)
19 
20By default, system prompts from each test example's messages array are used
21during generation. The --system-prompt flag overrides this for all examples.
22 
23Usage:
24  python evaluate_model.py \
25      --deployment-name my-ft-eval \
26      --test-file test.jsonl \
27      --judge-model gpt-4o \
28      --output results.json
29 
30  python evaluate_model.py \
31      --base-url "$BASE_URL" --api-key "$API_KEY" \
32      --deployment-name my-ft-eval \
33      --test-file test.jsonl \
34      --concurrency 4
35"""
36 
37import json
38import os
39import re
40import sys
41 
42try:
43    sys.stdout.reconfigure(encoding="utf-8")
44    sys.stderr.reconfigure(encoding="utf-8")
45except (AttributeError, OSError):
46    pass  # Stream not reconfigurable (older Python or non-tty); default encoding is fine
47import time
48from concurrent.futures import ThreadPoolExecutor, as_completed
49sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
50from common import HelpOnErrorParser, get_clients, _clamp_score
51 
52 
53JUDGE_PROMPT = """You are evaluating the quality of a model's output for a given task.
54 
55## Task prompt
56{prompt}
57 
58## Reference answer
59{reference}
60 
61## Model output
62{output}
63 
64## Scoring
65 
66Rate the output on two dimensions, each on a scale of 1-10:
67 
68**Correctness** (1-10): Does the output correctly accomplish the task?
69- 1-3: Fundamentally wrong or broken
70- 4-6: Partially correct with significant issues
71- 7-8: Mostly correct with minor issues
72- 9-10: Fully correct
73 
74**Conciseness** (1-10): Is the output appropriately concise?
75- 1-3: Extremely verbose or padded
76- 4-6: Contains unnecessary content
77- 7-8: Mostly concise with minor excess
78- 9-10: Clean and focused
79 
80Return ONLY a JSON object: {{"correctness": <int>, "conciseness": <int>}}"""
81 
82 
def _first_content(msgs, role):
    """Return the content of the first dict message in msgs with the given role, else None."""
    return next(
        (m.get("content") for m in msgs
         if isinstance(m, dict) and m.get("role") == role),
        None,
    )


def load_test_data(filepath):
    """Load the held-out test set from a JSONL file.

    Each non-blank line is expected to be a JSON object holding a
    'messages' list of {"role": ..., "content": ...} dicts. For every usable
    line, the first user message becomes the prompt, the first assistant
    message becomes the reference, and the first system message (if any) is
    kept so per-example system prompts are preserved.

    Unusable lines (invalid JSON, a non-object record, a missing or invalid
    'messages' list, or no non-empty user/assistant content) are reported on
    stdout with their 1-based line number and skipped. Messages that are not
    dicts, or that lack 'role'/'content', are ignored instead of raising.

    Args:
        filepath: Path to the UTF-8 encoded JSONL file.

    Returns:
        A list of dicts with keys "prompt", "reference" and "system_prompt"
        ("system_prompt" is None when the example has no system message).
    """
    data = []
    with open(filepath, encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                ex = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"⚠️ Skipping malformed JSON on line {lineno}: {e}")
                continue
            # A valid JSON line may still be a list/string/number rather than an object.
            msgs = ex.get("messages") if isinstance(ex, dict) else None
            if not isinstance(msgs, list):
                print(f"⚠️ Skipping line {lineno}: missing or invalid 'messages' list")
                continue
            prompt = _first_content(msgs, "user")
            reference = _first_content(msgs, "assistant")
            if not prompt:
                print(f"⚠️ Skipping line {lineno}: missing 'user' message")
                continue
            if not reference:
                print(f"⚠️ Skipping line {lineno}: missing 'assistant' message")
                continue
            data.append({
                "prompt": prompt,
                "reference": reference,
                "system_prompt": _first_content(msgs, "system"),
            })
    return data
115 
116 
def generate_response(client, deployment, prompt, system_prompt=None, max_retries=3):
    """Request one chat completion from the deployed model.

    The request holds an optional system message followed by the user
    prompt, and is sent with temperature 0.0 and a 2048-token completion cap.

    Returns the completion text on success. Failures never raise; they are
    reported as a string beginning with "ERROR:" so the caller can detect
    them by prefix:
      * the API returned no content (the finish reason is included),
      * the final attempt raised (the exception text is included),
      * max_retries allowed no attempt at all.
    Between failed attempts the function sleeps 3s, 6s, ... (3 * attempt number).
    """
    user_turn = {"role": "user", "content": prompt}
    if system_prompt:
        conversation = [{"role": "system", "content": system_prompt}, user_turn]
    else:
        conversation = [user_turn]

    final_index = max_retries - 1
    for retry_index in range(max_retries):
        try:
            completion = client.chat.completions.create(
                model=deployment,
                messages=conversation,
                temperature=0.0,
                max_completion_tokens=2048,
            )
            text = completion.choices[0].message.content
            if text is not None:
                return text
            # No content came back (e.g. content filter or empty completion):
            # report it through the same "ERROR:" sentinel as other failures.
            reason = getattr(completion.choices[0], "finish_reason", "unknown")
            return f"ERROR: empty content (finish_reason={reason})"
        except Exception as exc:
            if retry_index >= final_index:
                return f"ERROR: {exc}"
            # Linear back-off before the next attempt.
            time.sleep(3 * (retry_index + 1))
    return "ERROR: max retries exceeded"
144 
145 
def grade_response(judge_client, judge_model, prompt, reference, output, max_retries=3):
    """Ask the LLM judge to score one model output.

    JUDGE_PROMPT is filled with the task prompt, the reference answer and the
    model output, then sent as a single user message (temperature 0.0,
    200-token completion cap). The first brace-delimited span in the reply is
    parsed as JSON, and its "correctness" and "conciseness" values are passed
    through _clamp_score.

    Returns a dict with "correctness" and "conciseness". When grading fails,
    both are 0 and an "error" key explains why: the exception text if the
    last attempt raised, otherwise "All retries failed".

    A reply containing no JSON object is retried immediately; an exception
    (API failure or JSON that does not parse) waits 2 seconds before retrying.
    """
    rendered = JUDGE_PROMPT.format(prompt=prompt, reference=reference, output=output)
    final_index = max_retries - 1

    for retry_index in range(max_retries):
        try:
            reply = judge_client.chat.completions.create(
                model=judge_model,
                messages=[{"role": "user", "content": rendered}],
                temperature=0.0,
                max_completion_tokens=200,
            )
            raw = (reply.choices[0].message.content or "").strip()
            # The judge may wrap the object in extra prose; pull out the first {...} span.
            found = re.search(r'\{[^}]+\}', raw)
            if found is None:
                continue
            parsed = json.loads(found.group())
            return {
                "correctness": _clamp_score(parsed.get("correctness")),
                "conciseness": _clamp_score(parsed.get("conciseness")),
            }
        except Exception as exc:
            if retry_index >= final_index:
                return {"correctness": 0, "conciseness": 0, "error": str(exc)}
            time.sleep(2)

    return {"correctness": 0, "conciseness": 0, "error": "All retries failed"}
174 
175 
def main():
    """Command-line entry point: generate, grade, aggregate and save results.

    Steps:
      1. Parse CLI arguments (connection settings default to environment variables).
      2. Build the model client and, if a judge endpoint or judge key is given,
         a separate judge client; otherwise the model client is reused.
      3. Load the test set and generate one response per example, sequentially.
      4. Grade responses with the judge in a thread pool. Examples whose
         generation failed (output starts with "ERROR:") are not sent to the
         judge; they receive zero scores so they count as failed rather than
         distorting the averages.
      5. Average correctness and conciseness over examples with correctness > 0,
         combine them 70/30, print a summary and write the JSON report.

    Exits with status 1 when there are no usable test examples or no example
    received a valid score.
    """
    parser = HelpOnErrorParser(description="Evaluate a fine-tuned model with LLM judge")
    parser.add_argument("--base-url", default=os.environ.get("OPENAI_BASE_URL"),
                        help="Project /v1/ URL (preferred)")
    parser.add_argument("--endpoint", default=os.environ.get("AZURE_OPENAI_ENDPOINT"),
                        help="Azure OpenAI endpoint (fallback)")
    parser.add_argument("--project-endpoint", default=os.environ.get("AZURE_AI_PROJECT_ENDPOINT"),
                        help="Azure AI project endpoint (Foundry SDK)")
    parser.add_argument("--api-key", default=os.environ.get("AZURE_OPENAI_API_KEY"))
    parser.add_argument("--deployment-name", required=True, help="Deployed model name")
    parser.add_argument("--test-file", required=True, help="Held-out test set (JSONL)")
    parser.add_argument("--system-prompt", default=None,
                        help="Override system prompt for all examples (default: use per-example system prompt from test data)")

    # Judge config
    parser.add_argument("--judge-model", default="gpt-4o", help="Model for LLM judge")
    parser.add_argument("--judge-endpoint", help="Endpoint for judge (default: same as model)")
    parser.add_argument("--judge-api-key", help="API key for judge (default: same as model)")

    # Output
    parser.add_argument("--output", default="eval_results.json", help="Output file")
    parser.add_argument("--concurrency", type=int, default=1,
                        help="Parallel grading workers (generation is always sequential)")

    args = parser.parse_args()

    # ThreadPoolExecutor raises ValueError for max_workers <= 0, so clamp here.
    workers = args.concurrency
    if workers < 1:
        print(f"⚠️ --concurrency must be at least 1 (got {workers}); using 1")
        workers = 1

    # Set up model client via shared auth (supports /v1/, Foundry SDK, AzureOpenAI)
    model_client, _ = get_clients(
        base_url=args.base_url, azure_endpoint=args.endpoint,
        project_endpoint=args.project_endpoint, api_key=args.api_key
    )

    # Set up judge client (defaults to same connection as model)
    judge_key = args.judge_api_key or args.api_key
    if args.judge_endpoint:
        judge_client, _ = get_clients(azure_endpoint=args.judge_endpoint, api_key=judge_key)
    elif args.judge_api_key:
        # Different API key but same endpoint — create a new client with the judge key
        judge_client, _ = get_clients(
            base_url=args.base_url, azure_endpoint=args.endpoint,
            project_endpoint=args.project_endpoint, api_key=judge_key
        )
    else:
        judge_client = model_client

    # Load data
    test_data = load_test_data(args.test_file)
    print(f"Loaded {len(test_data)} test examples from {args.test_file}")
    if not test_data:
        print("No usable test examples — nothing to evaluate.")
        sys.exit(1)

    # Phase 1: Generate responses (sequential to avoid rate limits)
    print(f"\nGenerating responses from {args.deployment_name}...")
    for i, ex in enumerate(test_data):
        # Use CLI override if provided, otherwise use per-example system prompt
        effective_system_prompt = args.system_prompt if args.system_prompt is not None else ex.get("system_prompt")
        ex["output"] = generate_response(
            model_client, args.deployment_name, ex["prompt"], effective_system_prompt
        )
        if (i + 1) % 10 == 0:
            print(f"  Generated {i+1}/{len(test_data)}")

    errors = sum(1 for ex in test_data if ex["output"].startswith("ERROR:"))
    print(f"  Done. {errors} errors out of {len(test_data)}.")

    # Phase 2: Grade responses (parallel)
    print(f"\nGrading with {args.judge_model} (concurrency={workers})...")

    def grade_one(ex):
        # A generation failure is not a model answer: do not ask the judge to
        # score the error text. Zero scores keep it out of the averages below.
        if ex["output"].startswith("ERROR:"):
            return {"correctness": 0, "conciseness": 0, "error": ex["output"]}
        return grade_response(judge_client, args.judge_model,
                              ex["prompt"], ex["reference"], ex["output"])

    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {pool.submit(grade_one, ex): i for i, ex in enumerate(test_data)}
        for future in as_completed(futures):
            idx = futures[future]
            test_data[idx]["scores"] = future.result()

    # Aggregate: only examples with a positive correctness score count as scored.
    valid_scores = [ex["scores"] for ex in test_data
                    if ex["scores"]["correctness"] > 0]
    if not valid_scores:
        print("No valid scores — all grading failed.")
        sys.exit(1)

    avg_corr = sum(s["correctness"] for s in valid_scores) / len(valid_scores)
    avg_conc = sum(s["conciseness"] for s in valid_scores) / len(valid_scores)
    combined = 0.7 * avg_corr + 0.3 * avg_conc

    print(f"\n{'='*50}")
    print(f"Results for {args.deployment_name}")
    print(f"  Correctness:  {avg_corr:.2f}")
    print(f"  Conciseness:  {avg_conc:.2f}")
    print(f"  Combined:     {combined:.2f}")
    print(f"  (N={len(valid_scores)} scored, {len(test_data)-len(valid_scores)} failed)")
    print(f"{'='*50}")

    # Save
    results = {
        "deployment": args.deployment_name,
        "judge_model": args.judge_model,
        "n_examples": len(test_data),
        "n_scored": len(valid_scores),
        "correctness": round(avg_corr, 2),
        "conciseness": round(avg_conc, 2),
        "combined": round(combined, 2),
        "details": [
            {
                # Prompts are truncated to keep the report compact.
                "prompt": ex["prompt"][:200],
                "scores": ex.get("scores", {}),
            }
            for ex in test_data
        ],
    }

    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"\nDetailed results saved to {args.output}")
292 
293 
294if __name__ == "__main__":
295    main()
296

Marketplace

Source from repo

Microsoft Foundry Skill

Build and deploy AI applications on Azure AI Foundry using Microsoft's model catalog and AI services

microsoftGitHub microsoftOfficialSource repo Original GitHub link Publisher page

Files

145

Skill

n/a

Size

893.9 KB

Entrypoint

SKILL.md

Format

git-repo

Open file

finetuning/scripts/evaluate_model.py

Syntax-highlighted preview of this file as included in the skill package.

Rendered Source

code296 linesFree

finetuning/scripts/evaluate_model.py

1# /// script
2# dependencies = [
3#   "openai>=1.0",
4#   "azure-identity",
5# ]
6# ///
7"""
8evaluate_model.py — Custom 2-dimension LLM judge evaluator for fine-tuned models.
9 
10This is a lightweight evaluation script using the OpenAI API directly.
11For production evaluation, prefer the Azure AI Evaluation SDK which provides
12built-in graders, batch evaluation, and guardrail metrics. See
13references/evaluation.md for SDK patterns.
14 
15Uses the OpenAI API directly to:
161. Generate responses from a deployed fine-tuned model
172. Grade each response on correctness and conciseness using an LLM judge
183. Produce aggregate quality scores (weighted 70% correctness, 30% conciseness)
19 
20By default, system prompts from each test example's messages array are used
21during generation. The --system-prompt flag overrides this for all examples.
22 
23Usage:
24  python evaluate_model.py \
25      --deployment-name my-ft-eval \
26      --test-file test.jsonl \
27      --judge-model gpt-4o \
28      --output results.json
29 
30  python evaluate_model.py \
31      --base-url "$BASE_URL" --api-key "$API_KEY" \
32      --deployment-name my-ft-eval \
33      --test-file test.jsonl \
34      --concurrency 4
35"""
36 
37import json
38import os
39import re
40import sys
41 
42try:
43    sys.stdout.reconfigure(encoding="utf-8")
44    sys.stderr.reconfigure(encoding="utf-8")
45except (AttributeError, OSError):
46    pass  # Stream not reconfigurable (older Python or non-tty); default encoding is fine
47import time
48from concurrent.futures import ThreadPoolExecutor, as_completed
49sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
50from common import HelpOnErrorParser, get_clients, _clamp_score
51 
52 
53JUDGE_PROMPT = """You are evaluating the quality of a model's output for a given task.
54 
55## Task prompt
56{prompt}
57 
58## Reference answer
59{reference}
60 
61## Model output
62{output}
63 
64## Scoring
65 
66Rate the output on two dimensions, each on a scale of 1-10:
67 
68**Correctness** (1-10): Does the output correctly accomplish the task?
69- 1-3: Fundamentally wrong or broken
70- 4-6: Partially correct with significant issues
71- 7-8: Mostly correct with minor issues
72- 9-10: Fully correct
73 
74**Conciseness** (1-10): Is the output appropriately concise?
75- 1-3: Extremely verbose or padded
76- 4-6: Contains unnecessary content
77- 7-8: Mostly concise with minor excess
78- 9-10: Clean and focused
79 
80Return ONLY a JSON object: {{"correctness": <int>, "conciseness": <int>}}"""
81 
82 
def _first_content(msgs, role):
    """Return the content of the first dict message in msgs with the given role, else None."""
    return next(
        (m.get("content") for m in msgs
         if isinstance(m, dict) and m.get("role") == role),
        None,
    )


def load_test_data(filepath):
    """Load the held-out test set from a JSONL file.

    Each non-blank line is expected to be a JSON object holding a
    'messages' list of {"role": ..., "content": ...} dicts. For every usable
    line, the first user message becomes the prompt, the first assistant
    message becomes the reference, and the first system message (if any) is
    kept so per-example system prompts are preserved.

    Unusable lines (invalid JSON, a non-object record, a missing or invalid
    'messages' list, or no non-empty user/assistant content) are reported on
    stdout with their 1-based line number and skipped. Messages that are not
    dicts, or that lack 'role'/'content', are ignored instead of raising.

    Args:
        filepath: Path to the UTF-8 encoded JSONL file.

    Returns:
        A list of dicts with keys "prompt", "reference" and "system_prompt"
        ("system_prompt" is None when the example has no system message).
    """
    data = []
    with open(filepath, encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                ex = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"⚠️ Skipping malformed JSON on line {lineno}: {e}")
                continue
            # A valid JSON line may still be a list/string/number rather than an object.
            msgs = ex.get("messages") if isinstance(ex, dict) else None
            if not isinstance(msgs, list):
                print(f"⚠️ Skipping line {lineno}: missing or invalid 'messages' list")
                continue
            prompt = _first_content(msgs, "user")
            reference = _first_content(msgs, "assistant")
            if not prompt:
                print(f"⚠️ Skipping line {lineno}: missing 'user' message")
                continue
            if not reference:
                print(f"⚠️ Skipping line {lineno}: missing 'assistant' message")
                continue
            data.append({
                "prompt": prompt,
                "reference": reference,
                "system_prompt": _first_content(msgs, "system"),
            })
    return data
115 
116 
def generate_response(client, deployment, prompt, system_prompt=None, max_retries=3):
    """Request one chat completion from the deployed model.

    The request holds an optional system message followed by the user
    prompt, and is sent with temperature 0.0 and a 2048-token completion cap.

    Returns the completion text on success. Failures never raise; they are
    reported as a string beginning with "ERROR:" so the caller can detect
    them by prefix:
      * the API returned no content (the finish reason is included),
      * the final attempt raised (the exception text is included),
      * max_retries allowed no attempt at all.
    Between failed attempts the function sleeps 3s, 6s, ... (3 * attempt number).
    """
    user_turn = {"role": "user", "content": prompt}
    if system_prompt:
        conversation = [{"role": "system", "content": system_prompt}, user_turn]
    else:
        conversation = [user_turn]

    final_index = max_retries - 1
    for retry_index in range(max_retries):
        try:
            completion = client.chat.completions.create(
                model=deployment,
                messages=conversation,
                temperature=0.0,
                max_completion_tokens=2048,
            )
            text = completion.choices[0].message.content
            if text is not None:
                return text
            # No content came back (e.g. content filter or empty completion):
            # report it through the same "ERROR:" sentinel as other failures.
            reason = getattr(completion.choices[0], "finish_reason", "unknown")
            return f"ERROR: empty content (finish_reason={reason})"
        except Exception as exc:
            if retry_index >= final_index:
                return f"ERROR: {exc}"
            # Linear back-off before the next attempt.
            time.sleep(3 * (retry_index + 1))
    return "ERROR: max retries exceeded"
144 
145 
def grade_response(judge_client, judge_model, prompt, reference, output, max_retries=3):
    """Ask the LLM judge to score one model output.

    JUDGE_PROMPT is filled with the task prompt, the reference answer and the
    model output, then sent as a single user message (temperature 0.0,
    200-token completion cap). The first brace-delimited span in the reply is
    parsed as JSON, and its "correctness" and "conciseness" values are passed
    through _clamp_score.

    Returns a dict with "correctness" and "conciseness". When grading fails,
    both are 0 and an "error" key explains why: the exception text if the
    last attempt raised, otherwise "All retries failed".

    A reply containing no JSON object is retried immediately; an exception
    (API failure or JSON that does not parse) waits 2 seconds before retrying.
    """
    rendered = JUDGE_PROMPT.format(prompt=prompt, reference=reference, output=output)
    final_index = max_retries - 1

    for retry_index in range(max_retries):
        try:
            reply = judge_client.chat.completions.create(
                model=judge_model,
                messages=[{"role": "user", "content": rendered}],
                temperature=0.0,
                max_completion_tokens=200,
            )
            raw = (reply.choices[0].message.content or "").strip()
            # The judge may wrap the object in extra prose; pull out the first {...} span.
            found = re.search(r'\{[^}]+\}', raw)
            if found is None:
                continue
            parsed = json.loads(found.group())
            return {
                "correctness": _clamp_score(parsed.get("correctness")),
                "conciseness": _clamp_score(parsed.get("conciseness")),
            }
        except Exception as exc:
            if retry_index >= final_index:
                return {"correctness": 0, "conciseness": 0, "error": str(exc)}
            time.sleep(2)

    return {"correctness": 0, "conciseness": 0, "error": "All retries failed"}
174 
175 
def main():
    """Command-line entry point: generate, grade, aggregate and save results.

    Steps:
      1. Parse CLI arguments (connection settings default to environment variables).
      2. Build the model client and, if a judge endpoint or judge key is given,
         a separate judge client; otherwise the model client is reused.
      3. Load the test set and generate one response per example, sequentially.
      4. Grade responses with the judge in a thread pool. Examples whose
         generation failed (output starts with "ERROR:") are not sent to the
         judge; they receive zero scores so they count as failed rather than
         distorting the averages.
      5. Average correctness and conciseness over examples with correctness > 0,
         combine them 70/30, print a summary and write the JSON report.

    Exits with status 1 when there are no usable test examples or no example
    received a valid score.
    """
    parser = HelpOnErrorParser(description="Evaluate a fine-tuned model with LLM judge")
    parser.add_argument("--base-url", default=os.environ.get("OPENAI_BASE_URL"),
                        help="Project /v1/ URL (preferred)")
    parser.add_argument("--endpoint", default=os.environ.get("AZURE_OPENAI_ENDPOINT"),
                        help="Azure OpenAI endpoint (fallback)")
    parser.add_argument("--project-endpoint", default=os.environ.get("AZURE_AI_PROJECT_ENDPOINT"),
                        help="Azure AI project endpoint (Foundry SDK)")
    parser.add_argument("--api-key", default=os.environ.get("AZURE_OPENAI_API_KEY"))
    parser.add_argument("--deployment-name", required=True, help="Deployed model name")
    parser.add_argument("--test-file", required=True, help="Held-out test set (JSONL)")
    parser.add_argument("--system-prompt", default=None,
                        help="Override system prompt for all examples (default: use per-example system prompt from test data)")

    # Judge config
    parser.add_argument("--judge-model", default="gpt-4o", help="Model for LLM judge")
    parser.add_argument("--judge-endpoint", help="Endpoint for judge (default: same as model)")
    parser.add_argument("--judge-api-key", help="API key for judge (default: same as model)")

    # Output
    parser.add_argument("--output", default="eval_results.json", help="Output file")
    parser.add_argument("--concurrency", type=int, default=1,
                        help="Parallel grading workers (generation is always sequential)")

    args = parser.parse_args()

    # ThreadPoolExecutor raises ValueError for max_workers <= 0, so clamp here.
    workers = args.concurrency
    if workers < 1:
        print(f"⚠️ --concurrency must be at least 1 (got {workers}); using 1")
        workers = 1

    # Set up model client via shared auth (supports /v1/, Foundry SDK, AzureOpenAI)
    model_client, _ = get_clients(
        base_url=args.base_url, azure_endpoint=args.endpoint,
        project_endpoint=args.project_endpoint, api_key=args.api_key
    )

    # Set up judge client (defaults to same connection as model)
    judge_key = args.judge_api_key or args.api_key
    if args.judge_endpoint:
        judge_client, _ = get_clients(azure_endpoint=args.judge_endpoint, api_key=judge_key)
    elif args.judge_api_key:
        # Different API key but same endpoint — create a new client with the judge key
        judge_client, _ = get_clients(
            base_url=args.base_url, azure_endpoint=args.endpoint,
            project_endpoint=args.project_endpoint, api_key=judge_key
        )
    else:
        judge_client = model_client

    # Load data
    test_data = load_test_data(args.test_file)
    print(f"Loaded {len(test_data)} test examples from {args.test_file}")
    if not test_data:
        print("No usable test examples — nothing to evaluate.")
        sys.exit(1)

    # Phase 1: Generate responses (sequential to avoid rate limits)
    print(f"\nGenerating responses from {args.deployment_name}...")
    for i, ex in enumerate(test_data):
        # Use CLI override if provided, otherwise use per-example system prompt
        effective_system_prompt = args.system_prompt if args.system_prompt is not None else ex.get("system_prompt")
        ex["output"] = generate_response(
            model_client, args.deployment_name, ex["prompt"], effective_system_prompt
        )
        if (i + 1) % 10 == 0:
            print(f"  Generated {i+1}/{len(test_data)}")

    errors = sum(1 for ex in test_data if ex["output"].startswith("ERROR:"))
    print(f"  Done. {errors} errors out of {len(test_data)}.")

    # Phase 2: Grade responses (parallel)
    print(f"\nGrading with {args.judge_model} (concurrency={workers})...")

    def grade_one(ex):
        # A generation failure is not a model answer: do not ask the judge to
        # score the error text. Zero scores keep it out of the averages below.
        if ex["output"].startswith("ERROR:"):
            return {"correctness": 0, "conciseness": 0, "error": ex["output"]}
        return grade_response(judge_client, args.judge_model,
                              ex["prompt"], ex["reference"], ex["output"])

    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {pool.submit(grade_one, ex): i for i, ex in enumerate(test_data)}
        for future in as_completed(futures):
            idx = futures[future]
            test_data[idx]["scores"] = future.result()

    # Aggregate: only examples with a positive correctness score count as scored.
    valid_scores = [ex["scores"] for ex in test_data
                    if ex["scores"]["correctness"] > 0]
    if not valid_scores:
        print("No valid scores — all grading failed.")
        sys.exit(1)

    avg_corr = sum(s["correctness"] for s in valid_scores) / len(valid_scores)
    avg_conc = sum(s["conciseness"] for s in valid_scores) / len(valid_scores)
    combined = 0.7 * avg_corr + 0.3 * avg_conc

    print(f"\n{'='*50}")
    print(f"Results for {args.deployment_name}")
    print(f"  Correctness:  {avg_corr:.2f}")
    print(f"  Conciseness:  {avg_conc:.2f}")
    print(f"  Combined:     {combined:.2f}")
    print(f"  (N={len(valid_scores)} scored, {len(test_data)-len(valid_scores)} failed)")
    print(f"{'='*50}")

    # Save
    results = {
        "deployment": args.deployment_name,
        "judge_model": args.judge_model,
        "n_examples": len(test_data),
        "n_scored": len(valid_scores),
        "correctness": round(avg_corr, 2),
        "conciseness": round(avg_conc, 2),
        "combined": round(combined, 2),
        "details": [
            {
                # Prompts are truncated to keep the report compact.
                "prompt": ex["prompt"][:200],
                "scores": ex.get("scores", {}),
            }
            for ex in test_data
        ],
    }

    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"\nDetailed results saved to {args.output}")
292 
293 
294if __name__ == "__main__":
295    main()
296

Microsoft Foundry Skill

finetuning/scripts/evaluate_model.py

Preparing the source view

Microsoft Foundry Skill

finetuning/scripts/evaluate_model.py