Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Build and deploy AI applications on Azure AI Foundry using Microsoft's model catalog and AI services
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
finetuning/scripts/validate/validate_rft.py
#!/usr/bin/env python3
"""Validate RFT (Reinforcement Fine-Tuning) JSONL files for Azure AI Foundry.

Adapted from foundry-ft agent with critical additions from our platform gotchas:
- Grader escaping warnings for newlines (\\n must be \\\\n in JSON strings)
- Content moderation risk detection ("chain of thought" triggers RAI filter)
- Reference answer diversity check
"""
import argparse
import json
import sys
from collections import Counter
from typing import Optional

try:
    sys.stdout.reconfigure(encoding="utf-8")
    sys.stderr.reconfigure(encoding="utf-8")
except (AttributeError, OSError):
    pass  # Stream not reconfigurable (older Python or non-tty); default encoding is fine


# Phrases that produce a content-moderation warning when found (case-insensitively)
# anywhere in a serialized record.
RISKY_PHRASES = [
    "chain of thought", "step by step reasoning", "let me think",
    "think carefully", "reason through",
]

# How many characters after a field's key are inspected by the newline-escape check.
_ESCAPE_SCAN_WINDOW = 500
# Caps on how many findings the printed report lists in full.
_MAX_ERRORS_SHOWN = 20
_MAX_WARNINGS_SHOWN = 10


def _check_messages(record: dict, line_num: int) -> Optional[str]:
    """Return an error message for the record's 'messages' field, or None if it is fine."""
    if "messages" not in record:
        return f"Line {line_num}: Missing 'messages' field"

    msgs = record["messages"]
    if not isinstance(msgs, list) or not msgs:
        return f"Line {line_num}: 'messages' must be a non-empty array"
    # Guard before calling .get(): a non-object entry (e.g. a bare string)
    # would otherwise raise AttributeError and abort the whole validation run.
    if not all(isinstance(m, dict) for m in msgs):
        return f"Line {line_num}: every entry in 'messages' must be a JSON object"
    if not any(m.get("role") == "user" for m in msgs):
        return f"Line {line_num}: 'messages' has no 'user' message"

    last_role = msgs[-1].get("role")
    if last_role != "user":
        return (
            f"Line {line_num}: Last message must be 'user' role for RFT "
            f"(found '{last_role}') — unlike SFT, the model generates its own response"
        )
    return None


def _newline_escape_warnings(record: dict, extra_fields: set, raw_line: str, line_num: int) -> list:
    """Return warnings for extra fields whose raw JSON uses \\n but never \\\\n.

    This is a plain substring heuristic over the raw line rather than a regex
    parse: for each field whose parsed value contains a real newline, the
    window of raw text starting at the field's quoted key is inspected.
    """
    found = []
    # Sorted so the warning order is stable from run to run.
    for field in sorted(extra_fields):
        if "\n" not in str(record.get(field, "")):
            continue
        start = raw_line.find(f'"{field}"')
        if start == -1:
            # Key is spelled differently in the raw text (e.g. escaped); skip.
            continue
        region = raw_line[start:start + _ESCAPE_SCAN_WINDOW]
        if "\\n" in region and "\\\\n" not in region:
            found.append(
                f"Line {line_num}: '{field}' contains \\n sequences — "
                "if this is grader source code embedded in JSON, "
                "ensure newlines are escaped as \\\\n."
            )
    return found


def _print_report(filepath, total: int, errors: list, warnings: list, field_counts: Counter) -> None:
    """Print the human-readable validation report to stdout."""
    print(f"\n{'='*60}")
    print(f"RFT Validation Report: {filepath}")
    print(f"{'='*60}")
    print(f"Total records: {total}")
    print(f"Errors: {len(errors)}")
    print(f"Warnings: {len(warnings)}")

    if field_counts:
        print("\nGrader fields found:")
        for field, count in field_counts.most_common():
            print(f" • '{field}' — in {count}/{total} records")

    if errors:
        print("\n❌ ERRORS (must fix):")
        for err in errors[:_MAX_ERRORS_SHOWN]:
            print(f" • {err}")
        if len(errors) > _MAX_ERRORS_SHOWN:
            print(f" ... and {len(errors) - _MAX_ERRORS_SHOWN} more errors")

    if warnings:
        print("\n⚠️ WARNINGS:")
        for warn in warnings[:_MAX_WARNINGS_SHOWN]:
            print(f" • {warn}")
        if len(warnings) > _MAX_WARNINGS_SHOWN:
            print(f" ... and {len(warnings) - _MAX_WARNINGS_SHOWN} more warnings")

    # RFT-specific guidance
    if total > 0:
        print("\n💡 RFT tips:")
        print(" • Ensure your training grader matches your eval grader (alignment gotcha)")
        print(" • Start with reasoning_effort='medium', pass_threshold=0.5")
        print(" • RFT is primarily for o-series models (o4-mini). Check Azure docs for the latest supported model list.")

    if not errors:
        print("\n✅ Data is valid for RFT fine-tuning!")
    else:
        print(f"\n❌ Fix {len(errors)} error(s) before submitting.")


def validate_rft(filepath, expected_field=None):
    """Validate a JSONL file of RFT training records and print a report.

    Each non-blank line must be a JSON object with a non-empty 'messages'
    array whose last entry has role 'user', plus at least one extra
    ("grader") field. Warnings cover newline escaping in grader fields,
    phrases from RISKY_PHRASES, inconsistent grader-field sets across
    records, identical grader values, and long average grader values.

    Args:
        filepath: Path to the JSONL file. It is read as UTF-8; a leading
            byte-order mark is accepted and ignored.
        expected_field: Name of a grader field every record must contain with
            a non-empty value. When None, any field besides 'messages' counts.

    Raises:
        SystemExit: With code 1 when at least one error was found.
        OSError: If the file cannot be opened.
    """
    errors = []
    warnings = []
    total = 0
    # (file line number, extra field names) for every successfully parsed object.
    schemas: list[tuple[int, set[str]]] = []
    all_extra_field_counts: Counter = Counter()
    grader_values: list[str] = []

    # "utf-8-sig" reads plain UTF-8 and also strips a leading BOM; with plain
    # "utf-8" the BOM stays in the first line and json.loads rejects it.
    with open(filepath, "r", encoding="utf-8-sig") as f:
        for line_num, raw_line in enumerate(f, 1):
            line = raw_line.strip()
            if not line:
                continue
            total += 1

            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f"Line {line_num}: Invalid JSON — {e}")
                continue

            # Valid JSON that is not an object (a number, string, array, ...)
            # cannot hold fields, so report it instead of crashing below.
            if not isinstance(record, dict):
                errors.append(
                    f"Line {line_num}: Record must be a JSON object "
                    f"(found {type(record).__name__})"
                )
                continue

            messages_error = _check_messages(record, line_num)
            if messages_error:
                errors.append(messages_error)

            # Detect extra fields (grader fields) beyond 'messages'
            extra_fields = set(record) - {"messages"}
            schemas.append((line_num, extra_fields))
            all_extra_field_counts.update(extra_fields)

            if expected_field:
                if expected_field not in record:
                    errors.append(f"Line {line_num}: Missing expected field '{expected_field}'")
                else:
                    val = str(record[expected_field]).strip()
                    if val:
                        grader_values.append(val)
                    else:
                        errors.append(f"Line {line_num}: '{expected_field}' is empty")
            elif not extra_fields:
                errors.append(
                    f"Line {line_num}: No grader fields found — RFT requires at least "
                    "one field beyond 'messages' (e.g. 'answer', 'reference_code')"
                )
            else:
                # Collect values from extra fields for the diversity check
                for field in sorted(extra_fields):
                    val = str(record[field]).strip()
                    if val:
                        grader_values.append(val)

            # Check for under-escaped newlines in extra fields (CRITICAL platform gotcha)
            warnings.extend(_newline_escape_warnings(record, extra_fields, raw_line, line_num))

            # Content moderation risk: warn once per record, on the first matching phrase
            all_text = json.dumps(record).lower()
            risky = next((p for p in RISKY_PHRASES if p in all_text), None)
            if risky is not None:
                warnings.append(
                    f"Line {line_num}: Contains '{risky}' — may trigger Azure content moderation filter."
                )

    if total == 0:
        # A file with no records must not be reported as valid training data.
        errors.append("File contains no records")

    # Check for inconsistent extra-field schemas across examples. Real file
    # line numbers are reported, so skipped blank/invalid lines cannot shift them.
    with_fields = [(num, fields) for num, fields in schemas if fields]
    if len(with_fields) > 1:
        first_line, first_schema = with_fields[0]
        inconsistent_lines = [num for num, fields in with_fields if fields != first_schema]
        if inconsistent_lines:
            warnings.append(
                f"Inconsistent grader fields across examples — "
                f"line {first_line} has {sorted(first_schema)}, but {len(inconsistent_lines)} "
                f"line(s) differ (e.g. line {inconsistent_lines[0]}). "
                "Ensure your grader handles all field variants."
            )

    # Diversity check
    if grader_values:
        if len(set(grader_values)) == 1:
            warnings.append(
                f"All grader field values are identical ('{grader_values[0][:50]}...') — "
                "grader may not learn effectively"
            )
        avg_len = sum(len(v) for v in grader_values) / len(grader_values)
        if avg_len > 500:
            warnings.append(
                f"Average grader field value length is {avg_len:.0f} chars — "
                "consider using a model_grader instead of string_check"
            )

    _print_report(filepath, total, errors, warnings, all_extra_field_counts)

    if errors:
        sys.exit(1)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Validate RFT (Reinforcement Fine-Tuning) JSONL files for Azure AI Foundry."
    )
    parser.add_argument("filepath", help="Path to the JSONL file to validate")
    parser.add_argument(
        "--expected-field",
        default=None,
        help="Specific grader field name to require (e.g. 'answer'). "
        "If omitted, any extra field beyond 'messages' is accepted.",
    )
    args = parser.parse_args()
    try:
        validate_rft(args.filepath, expected_field=args.expected_field)
    except (OSError, UnicodeDecodeError) as exc:
        # Report an unreadable or non-UTF-8 file as a usage error, not a traceback.
        parser.error(f"Cannot read '{args.filepath}': {exc}")