Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Deploy, evaluate, and manage AI agents end-to-end on Microsoft Azure AI Foundry
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
finetuning/scripts/score_dataset.py
# /// script
# dependencies = [
#     "openai>=1.0",
#     "azure-identity",
# ]
# ///
"""
score_dataset.py — Assess training data quality using an LLM judge.

Scores each example on correctness, relevance, and quality by default,
and optionally filters out low-quality examples.

Usage:
    # Score all examples
    python score_dataset.py --input training.jsonl --output scored.jsonl

    # Score and filter (keep only score >= 7)
    python score_dataset.py --input training.jsonl --output filtered.jsonl --min-score 7

    # Custom scoring dimensions
    python score_dataset.py --input training.jsonl --output scored.jsonl \
        --dimensions "correctness,clarity,completeness"
"""

import json
import os
import re
import statistics
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

try:
    sys.stdout.reconfigure(encoding="utf-8")
    sys.stderr.reconfigure(encoding="utf-8")
except (AttributeError, OSError):
    pass  # Stream not reconfigurable (older Python or non-tty); default encoding is fine

# `common` lives next to this script, so make the script directory importable
# before importing from it.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from common import HelpOnErrorParser, get_clients, _clamp_score  # noqa: E402


QUALITY_PROMPT = """You are a data quality assessor for machine learning training data.

## Task
Evaluate this training example for quality.

## User input (what the model receives)
{user_content}

## Assistant output (what the model should learn to produce)
{assistant_content}

## Scoring dimensions
{dimensions_text}

Rate each dimension on a scale of 1-10.

Return ONLY a JSON object with dimension names as keys and integer scores as values.
Example: {example_json}"""


DEFAULT_DIMENSIONS = {
    "correctness": "Is the assistant's output factually/functionally correct?",
    "relevance": "Does the output directly address the user's request?",
    "quality": "Is the output well-written, well-formatted, and professional?",
}

# Keys this script adds to each output record (unless --strip-metadata is set).
_METADATA_KEYS = ("_quality_scores", "_avg_quality")

# Each side of the example is truncated to this many characters in the prompt.
_MAX_CONTENT_CHARS = 2000
# Judge calls are attempted this many times, pausing between failed attempts.
_MAX_ATTEMPTS = 3
_RETRY_DELAY_SECONDS = 2

# First flat (non-nested) {...} span in the judge's reply.
_JSON_OBJECT_RE = re.compile(r'\{[^}]+\}')


def _as_text(content):
    """Return message *content* as a string.

    ``None`` becomes ``""``; strings pass through; anything else (for example
    a list of content parts) is serialized as JSON so it can be sliced and
    embedded in the judge prompt.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    return json.dumps(content, ensure_ascii=False)


def _first_content(messages, role):
    """Return the text of the first message in *messages* with the given *role*.

    Entries that are not dicts are ignored. Returns ``""`` when no message
    has that role or the matching message has no content.
    """
    if not isinstance(messages, list):
        return ""
    for message in messages:
        if isinstance(message, dict) and message.get("role") == role:
            return _as_text(message.get("content"))
    return ""


def _load_examples(path):
    """Read a JSONL file and return one entry per usable line.

    Blank lines are skipped silently; malformed JSON and lines whose top-level
    value is not a JSON object are skipped with a printed warning. Each entry
    is ``{"data": <parsed record>, "user": <text>, "assistant": <text>}``
    where the texts come from the first user / assistant message.
    """
    examples = []
    with open(path, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"⚠️ Skipping malformed JSON on line {line_no}: {e}")
                continue
            if not isinstance(record, dict):
                # Metadata is written back into the record later, so it must be an object.
                print(f"⚠️ Skipping non-object JSON on line {line_no}")
                continue
            messages = record.get("messages")
            examples.append({
                "data": record,
                "user": _first_content(messages, "user"),
                "assistant": _first_content(messages, "assistant"),
            })
    return examples


def _print_distribution(averages):
    """Print mean, min, max, and median of the non-empty list *averages*."""
    print("\nQuality Distribution:")
    print(f" Mean: {sum(averages)/len(averages):.1f}")
    print(f" Min: {min(averages):.1f}")
    print(f" Max: {max(averages):.1f}")
    print(f" Median: {statistics.median(averages):.1f}")


def score_example(client, model, user_content, assistant_content, dimensions):
    """Score a single training example with the judge model.

    Args:
        client: Object exposing ``chat.completions.create(...)``.
        model: Judge model name passed to the completion call.
        user_content: User-side text; ``None`` / non-string values are coerced
            to text. Truncated to 2000 characters in the prompt.
        assistant_content: Assistant-side text, handled like *user_content*.
        dimensions: Mapping of dimension name -> description shown to the judge.

    Returns:
        A dict with one entry per dimension. On success each value is
        ``_clamp_score(<judge value>)`` (helper imported from ``common``;
        its exact range is defined there). If every attempt fails, every
        dimension maps to ``0``.
    """
    dims_text = "\n".join(f"**{k}** (1-10): {v}" for k, v in dimensions.items())
    example = dict.fromkeys(dimensions, 8)

    prompt = QUALITY_PROMPT.format(
        user_content=_as_text(user_content)[:_MAX_CONTENT_CHARS],
        assistant_content=_as_text(assistant_content)[:_MAX_CONTENT_CHARS],
        dimensions_text=dims_text,
        example_json=json.dumps(example),
    )

    last_problem = "no JSON object in judge response"
    for attempt in range(_MAX_ATTEMPTS):
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0,
                max_completion_tokens=200,
            )
            text = (resp.choices[0].message.content or "").strip()
            match = _JSON_OBJECT_RE.search(text)
            if match:
                scores = json.loads(match.group())
                return {k: _clamp_score(scores.get(k)) for k in dimensions}
        except Exception as exc:
            # Deliberately broad: scoring is best-effort and the client's
            # exception types are not visible here. Pause, then retry.
            last_problem = f"{type(exc).__name__}: {exc}"
            if attempt < _MAX_ATTEMPTS - 1:
                time.sleep(_RETRY_DELAY_SECONDS)

    # Surface the failure instead of silently returning zeros.
    print(
        f"⚠️ Scoring failed after {_MAX_ATTEMPTS} attempts ({last_problem}); recording zeros",
        file=sys.stderr,
    )
    return dict.fromkeys(dimensions, 0)


def main():
    """Command-line entry point: load, score, summarize, filter, and write examples."""
    parser = HelpOnErrorParser(description="Score training data quality with LLM judge")
    parser.add_argument("--base-url", default=os.environ.get("OPENAI_BASE_URL"),
                        help="Project /v1/ URL (preferred)")
    parser.add_argument("--endpoint", default=os.environ.get("AZURE_OPENAI_ENDPOINT"),
                        help="Azure OpenAI endpoint (fallback)")
    parser.add_argument("--project-endpoint", default=os.environ.get("AZURE_AI_PROJECT_ENDPOINT"),
                        help="Azure AI project endpoint (Foundry SDK)")
    parser.add_argument("--api-key", default=os.environ.get("AZURE_OPENAI_API_KEY"))
    parser.add_argument("--model", default="gpt-4o", help="Judge model")
    parser.add_argument("--input", required=True, help="Input JSONL file")
    parser.add_argument("--output", required=True, help="Output JSONL file (with scores)")
    parser.add_argument("--min-score", type=float, default=None,
                        help="Minimum average score to keep (filters below this)")
    parser.add_argument("--dimensions", default=None,
                        help="Comma-separated dimension names (default: correctness,relevance,quality)")
    parser.add_argument("--concurrency", type=int, default=4, help="Parallel scoring workers")
    parser.add_argument("--strip-metadata", action="store_true",
                        help="Remove _quality_scores and _avg_quality from output (safe for training input)")
    args = parser.parse_args()

    # get_clients returns a (client, method) pair; only the client is used here.
    client, _method = get_clients(
        base_url=args.base_url, azure_endpoint=args.endpoint,
        project_endpoint=args.project_endpoint, api_key=args.api_key
    )

    # Parse dimensions. Empty names (e.g. from a trailing comma) are dropped;
    # if nothing usable remains, fall back to the defaults.
    dim_names = [d.strip() for d in (args.dimensions or "").split(",") if d.strip()]
    if dim_names:
        dimensions = {d: f"Rate the {d} of the output" for d in dim_names}
    else:
        dimensions = DEFAULT_DIMENSIONS

    # Load data
    examples = _load_examples(args.input)

    print(f"Loaded {len(examples)} examples. Scoring with {args.model}...")

    # Score in parallel. ThreadPoolExecutor rejects max_workers <= 0, so clamp.
    with ThreadPoolExecutor(max_workers=max(1, args.concurrency)) as pool:
        futures = {
            pool.submit(score_example, client, args.model,
                        ex["user"], ex["assistant"], dimensions): ex
            for ex in examples
        }
        done = 0
        for future in as_completed(futures):
            futures[future]["scores"] = future.result()
            done += 1
            if done % 25 == 0:
                print(f" Scored {done}/{len(examples)}")

    # Calculate stats. Examples whose scores are all zero (scoring failed)
    # get no avg_score and are left out of the distribution.
    all_avgs = []
    for ex in examples:
        scores = ex.get("scores", {})
        if scores and any(v > 0 for v in scores.values()):
            avg = sum(scores.values()) / len(scores)
            ex["avg_score"] = avg
            all_avgs.append(avg)

    if all_avgs:
        _print_distribution(all_avgs)

    # Filter and write
    filtering = args.min_score is not None
    kept = 0
    filtered = 0
    with open(args.output, "w", encoding="utf-8") as f:
        for ex in examples:
            record = ex["data"]
            if args.strip_metadata:
                # Also drop metadata carried over from a previously scored input.
                for key in _METADATA_KEYS:
                    record.pop(key, None)
            else:
                record["_quality_scores"] = ex.get("scores", {})
                record["_avg_quality"] = ex.get("avg_score", 0)

            # Examples without an avg_score count as 0 for filtering.
            if filtering and ex.get("avg_score", 0) < args.min_score:
                filtered += 1
                continue

            f.write(json.dumps(record, ensure_ascii=False) + "\n")
            kept += 1

    print(f"\nKept: {kept}, Filtered: {filtered}")
    if filtering:
        print(f"(min_score threshold: {args.min_score})")
    if args.strip_metadata:
        print("(metadata stripped — output is safe for training input)")
    print(f"Output: {args.output}")


if __name__ == "__main__":
    main()