Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Build and deploy AI applications on Azure AI Foundry using Microsoft's model catalog and AI services
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
finetuning/scripts/score_dataset.py
# /// script
# dependencies = [
#   "openai>=1.0",
#   "azure-identity",
# ]
# ///
"""
score_dataset.py — Assess training data quality using an LLM judge.

Scores each example on correctness and relevance, optionally filters
out low-quality examples.

Usage:
    # Score all examples
    python score_dataset.py --input training.jsonl --output scored.jsonl

    # Score and filter (keep only score >= 7)
    python score_dataset.py --input training.jsonl --output filtered.jsonl --min-score 7

    # Custom scoring dimensions
    python score_dataset.py --input training.jsonl --output scored.jsonl \\
        --dimensions "correctness,clarity,completeness"
"""

import json
import os
import re
import statistics
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

try:
    sys.stdout.reconfigure(encoding="utf-8")
    sys.stderr.reconfigure(encoding="utf-8")
except (AttributeError, OSError):
    pass  # Stream not reconfigurable (older Python or non-tty); default encoding is fine

# The sibling ``common`` module lives next to this script; make it importable
# regardless of the current working directory.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from common import HelpOnErrorParser, get_clients, _clamp_score


QUALITY_PROMPT = """You are a data quality assessor for machine learning training data.

## Task
Evaluate this training example for quality.

## User input (what the model receives)
{user_content}

## Assistant output (what the model should learn to produce)
{assistant_content}

## Scoring dimensions
{dimensions_text}

Rate each dimension on a scale of 1-10.

Return ONLY a JSON object with dimension names as keys and integer scores as values.
Example: {example_json}"""


DEFAULT_DIMENSIONS = {
    "correctness": "Is the assistant's output factually/functionally correct?",
    "relevance": "Does the output directly address the user's request?",
    "quality": "Is the output well-written, well-formatted, and professional?",
}

# Each side of the example is truncated to this many characters in the prompt.
MAX_CONTENT_CHARS = 2000
# Number of judge calls attempted per example before giving up.
MAX_ATTEMPTS = 3
# Pause between attempts after an exception (API error, invalid JSON, ...).
RETRY_DELAY_SECONDS = 2
# Keys this script adds to each output record.
METADATA_KEYS = ("_quality_scores", "_avg_quality")

# First flat (non-nested) ``{...}`` object in the judge's reply.
_JSON_OBJECT_RE = re.compile(r'\{[^}]+\}')


def _as_text(content):
    """Return message content as a string that is safe to slice and format.

    ``None`` becomes ``""``, strings pass through unchanged, and anything else
    (for example a list of content parts) is rendered as JSON.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    return json.dumps(content, ensure_ascii=False, default=str)


def _first_content(messages, role):
    """Return the text of the first message with the given role, or ``""``."""
    for msg in messages:
        if isinstance(msg, dict) and msg.get("role") == role:
            return _as_text(msg.get("content"))
    return ""


def score_example(client, model, user_content, assistant_content, dimensions):
    """Score a single training example with the judge model.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (obtained
            from ``get_clients`` in ``main``).
        model: Judge model name passed to the completion call.
        user_content: The example's user-side text (``None`` is treated as "").
        assistant_content: The example's assistant-side text (``None`` is
            treated as "").
        dimensions: Mapping of dimension name -> description shown to the judge.

    Returns:
        A dict with one entry per dimension. On success each value is
        ``_clamp_score(<judge value>)`` (NOTE(review): ``_clamp_score`` is
        defined in ``common``; presumably it normalises to the score range —
        confirm). If every attempt fails, every dimension maps to ``0``.
    """
    if not dimensions:
        # Nothing to rate; avoid spending API calls on an empty rubric.
        return {}

    dims_text = "\n".join(f"**{k}** (1-10): {v}" for k, v in dimensions.items())
    example = {k: 8 for k in dimensions}

    prompt = QUALITY_PROMPT.format(
        user_content=_as_text(user_content)[:MAX_CONTENT_CHARS],
        assistant_content=_as_text(assistant_content)[:MAX_CONTENT_CHARS],
        dimensions_text=dims_text,
        example_json=json.dumps(example),
    )

    for attempt in range(MAX_ATTEMPTS):
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0,
                max_completion_tokens=200,
            )
            text = (resp.choices[0].message.content or "").strip()
            match = _JSON_OBJECT_RE.search(text)
            if match:
                scores = json.loads(match.group())
                return {k: _clamp_score(scores.get(k)) for k in dimensions}
            # No JSON object in the reply: fall through and retry immediately.
        except Exception:
            # Deliberate best-effort: any API or parsing failure counts as a
            # failed attempt; the all-zero fallback below marks the example
            # as unscored instead of aborting the whole run.
            if attempt < MAX_ATTEMPTS - 1:
                time.sleep(RETRY_DELAY_SECONDS)

    return {k: 0 for k in dimensions}


def _load_examples(path):
    """Read a JSONL file and return a list of ``{"data", "user", "assistant"}``.

    Blank lines are ignored. Lines that are not valid JSON, or whose JSON value
    is not an object, are skipped with a warning on stdout.
    """
    examples = []
    with open(path, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                ex = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"⚠️ Skipping malformed JSON on line {line_no}: {e}")
                continue
            if not isinstance(ex, dict):
                print(f"⚠️ Skipping non-object JSON on line {line_no}")
                continue
            msgs = ex.get("messages", [])
            if not isinstance(msgs, list):
                msgs = []
            examples.append({
                "data": ex,
                "user": _first_content(msgs, "user"),
                "assistant": _first_content(msgs, "assistant"),
            })
    return examples


def main():
    """CLI entry point: load JSONL, score every example, report stats, write output."""
    parser = HelpOnErrorParser(description="Score training data quality with LLM judge")
    parser.add_argument("--base-url", default=os.environ.get("OPENAI_BASE_URL"),
                        help="Project /v1/ URL (preferred)")
    parser.add_argument("--endpoint", default=os.environ.get("AZURE_OPENAI_ENDPOINT"),
                        help="Azure OpenAI endpoint (fallback)")
    parser.add_argument("--project-endpoint", default=os.environ.get("AZURE_AI_PROJECT_ENDPOINT"),
                        help="Azure AI project endpoint (Foundry SDK)")
    parser.add_argument("--api-key", default=os.environ.get("AZURE_OPENAI_API_KEY"))
    parser.add_argument("--model", default="gpt-4o", help="Judge model")
    parser.add_argument("--input", required=True, help="Input JSONL file")
    parser.add_argument("--output", required=True, help="Output JSONL file (with scores)")
    parser.add_argument("--min-score", type=float, default=None,
                        help="Minimum average score to keep (filters below this)")
    parser.add_argument("--dimensions", default=None,
                        help="Comma-separated dimension names (default: correctness,relevance,quality)")
    parser.add_argument("--concurrency", type=int, default=4, help="Parallel scoring workers")
    parser.add_argument("--strip-metadata", action="store_true",
                        help="Remove _quality_scores and _avg_quality from output (safe for training input)")
    args = parser.parse_args()

    # Parse dimensions before creating clients so bad input fails fast.
    if args.dimensions:
        # Drop empty names produced by stray or trailing commas.
        dim_names = [d.strip() for d in args.dimensions.split(",") if d.strip()]
        if not dim_names:
            parser.error("--dimensions must contain at least one non-empty name")
        dimensions = {d: f"Rate the {d} of the output" for d in dim_names}
    else:
        dimensions = DEFAULT_DIMENSIONS

    # get_clients returns a (client, method) pair; only the client is used here.
    client, _method = get_clients(
        base_url=args.base_url, azure_endpoint=args.endpoint,
        project_endpoint=args.project_endpoint, api_key=args.api_key
    )

    # Load data
    examples = _load_examples(args.input)

    print(f"Loaded {len(examples)} examples. Scoring with {args.model}...")

    # Score in parallel
    def score_one(idx):
        ex = examples[idx]
        scores = score_example(client, args.model, ex["user"], ex["assistant"], dimensions)
        return idx, scores

    # ThreadPoolExecutor rejects max_workers <= 0, so clamp to at least one.
    with ThreadPoolExecutor(max_workers=max(1, args.concurrency)) as pool:
        futures = [pool.submit(score_one, i) for i in range(len(examples))]
        done = 0
        for future in as_completed(futures):
            idx, scores = future.result()
            examples[idx]["scores"] = scores
            done += 1
            if done % 25 == 0:
                print(f" Scored {done}/{len(examples)}")

    # Calculate stats. An all-zero score dict is score_example's failure
    # fallback, so those examples are left out of the distribution.
    all_avgs = []
    for ex in examples:
        scores = ex.get("scores", {})
        if scores and any(v > 0 for v in scores.values()):
            avg = sum(scores.values()) / len(scores)
            ex["avg_score"] = avg
            all_avgs.append(avg)

    if all_avgs:
        print("\nQuality Distribution:")
        print(f" Mean: {sum(all_avgs)/len(all_avgs):.1f}")
        print(f" Min: {min(all_avgs):.1f}")
        print(f" Max: {max(all_avgs):.1f}")
        print(f" Median: {statistics.median(all_avgs):.1f}")

    unscored = len(examples) - len(all_avgs)
    if unscored:
        print(f"⚠️ {unscored} example(s) could not be scored (recorded with score 0)")

    # Filter and write
    filter_active = args.min_score is not None
    kept = 0
    filtered = 0
    with open(args.output, "w", encoding="utf-8") as f:
        for ex in examples:
            record = ex["data"]
            if filter_active and ex.get("avg_score", 0) < args.min_score:
                filtered += 1
                continue

            if args.strip_metadata:
                # Also drop metadata left over from a previous scoring pass so
                # the output really is free of these keys.
                for key in METADATA_KEYS:
                    record.pop(key, None)
            else:
                record["_quality_scores"] = ex.get("scores", {})
                record["_avg_quality"] = ex.get("avg_score", 0)

            f.write(json.dumps(record, ensure_ascii=False) + "\n")
            kept += 1

    print(f"\nKept: {kept}, Filtered: {filtered}")
    if filter_active:
        print(f"(min_score threshold: {args.min_score})")
    if args.strip_metadata:
        print("(metadata stripped — output is safe for training input)")
    print(f"Output: {args.output}")


if __name__ == "__main__":
    main()