Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Deploy, evaluate, and manage AI agents end-to-end on Microsoft Azure AI Foundry
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
finetuning/scripts/validate/validate_sft.py
#!/usr/bin/env python3
"""Validate SFT (Supervised Fine-Tuning) JSONL files for Azure AI Foundry.

Adapted from foundry-ft agent with additional checks from our platform gotchas:
- Token length warnings (4096 limit varies by model)
- System prompt consistency check
"""
import json
import sys


try:
    sys.stdout.reconfigure(encoding="utf-8")
    sys.stderr.reconfigure(encoding="utf-8")
except (AttributeError, OSError):
    pass  # Stream not reconfigurable (older Python or non-tty); default encoding is fine

VALID_ROLES = {"system", "user", "assistant", "tool"}


def estimate_tokens(text: str) -> int:
    """Rough token estimate: ~4 chars per token for English text.

    Always returns at least 1, even for an empty string.
    """
    return max(1, len(text) // 4)


def _check_messages(messages: list, line_num: int, errors: list,
                    warnings: list, system_prompts: set) -> str:
    """Validate the messages of one record and return their joined content.

    Problems are appended to ``errors`` / ``warnings`` in place, and the first
    100 characters of every system message are added to ``system_prompts``.
    """
    roles_found = set()
    parts = []

    for i, msg in enumerate(messages):
        # A message that is a string/number/list would otherwise crash the
        # key lookups below (or silently perform a substring test).
        if not isinstance(msg, dict):
            errors.append(f"Line {line_num}, message {i}: Message must be a JSON object")
            continue

        role = msg.get("role")
        if "role" not in msg:
            errors.append(f"Line {line_num}, message {i}: Missing 'role'")
        elif not isinstance(role, str) or role not in VALID_ROLES:
            # The isinstance guard keeps unhashable roles (e.g. a list) from
            # raising TypeError in the set membership test.
            errors.append(f"Line {line_num}, message {i}: Invalid role '{role}' (expected: {VALID_ROLES})")
        else:
            roles_found.add(role)

        if "content" not in msg and "tool_calls" not in msg:
            errors.append(f"Line {line_num}, message {i}: Missing 'content' (and no 'tool_calls')")
        elif msg.get("content") is not None:
            content = str(msg["content"])
            if not content.strip():
                warnings.append(f"Line {line_num}, message {i}: Empty content string")
            parts.append(content)

            # Only the first 100 characters are kept, so prompts that differ
            # only after that point count as the same prompt.
            if role == "system":
                system_prompts.add(content.strip()[:100])

    if "user" not in roles_found:
        errors.append(f"Line {line_num}: No 'user' message found")
    if "assistant" not in roles_found:
        errors.append(f"Line {line_num}: No 'assistant' message found")

    return "".join(parts)


def validate_sft(filepath: str, max_tokens: int = 4096) -> None:
    """Validate a chat-format SFT JSONL file and print a report to stdout.

    Each non-blank line must be a JSON object with a non-empty ``messages``
    array; every message needs a valid ``role`` and either ``content`` or
    ``tool_calls``, and each record needs at least one ``user`` and one
    ``assistant`` message.

    Args:
        filepath: Path to the JSONL file. It is read as UTF-8; a leading
            byte-order mark is accepted and ignored.
        max_tokens: Estimated per-record token count above which a warning
            is emitted.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        SystemExit: With code 1 when at least one error was found.
    """
    errors = []
    warnings = []
    total = 0
    token_counts = []
    system_prompts = set()

    # "utf-8-sig" strips a BOM if present and behaves like "utf-8" otherwise;
    # with plain "utf-8" the BOM reaches json.loads and line 1 is rejected.
    with open(filepath, "r", encoding="utf-8-sig") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            total += 1

            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f"Line {line_num}: Invalid JSON — {e}")
                continue

            if not isinstance(record, dict):
                errors.append(f"Line {line_num}: Record must be a JSON object")
                continue

            if "messages" not in record:
                errors.append(f"Line {line_num}: Missing 'messages' field")
                continue

            messages = record["messages"]
            if not isinstance(messages, list) or not messages:
                errors.append(f"Line {line_num}: 'messages' must be a non-empty array")
                continue

            total_text = _check_messages(messages, line_num, errors, warnings, system_prompts)

            tokens = estimate_tokens(total_text)
            token_counts.append(tokens)
            if tokens > max_tokens:
                warnings.append(f"Line {line_num}: ~{tokens} tokens (exceeds {max_tokens} limit for most models)")

    if total == 0:
        # An empty file must not be reported as valid training data.
        errors.append("No records found (file is empty or contains only blank lines)")

    # Recorded before the summary so the printed warning count includes it.
    if len(system_prompts) > 1:
        warnings.append(f"Found {len(system_prompts)} different system prompts — ensure this is intentional")

    # Report
    print(f"\n{'='*60}")
    print(f"SFT Validation Report: {filepath}")
    print(f"{'='*60}")
    print(f"Total records: {total}")
    print(f"Errors: {len(errors)}")
    print(f"Warnings: {len(warnings)}")

    if token_counts:
        avg_tok = sum(token_counts) / len(token_counts)
        print("\nToken stats (approx):")
        print(f" Avg: {avg_tok:.0f} Min: {min(token_counts)} Max: {max(token_counts)}")
        print(f" Total: {sum(token_counts):,}")

    if system_prompts:
        print(f"\nSystem prompts: {len(system_prompts)} unique")

    if errors:
        print("\n❌ ERRORS (must fix):")
        for e in errors[:20]:
            print(f" • {e}")
        if len(errors) > 20:
            print(f" ... and {len(errors) - 20} more errors")

    if warnings:
        print("\n⚠️ WARNINGS:")
        for w in warnings[:10]:
            print(f" • {w}")
        if len(warnings) > 10:
            print(f" ... and {len(warnings) - 10} more warnings")

    if not errors:
        print("\n✅ Data is valid for SFT fine-tuning!")
    else:
        print(f"\n❌ Fix {len(errors)} error(s) before submitting.")
        sys.exit(1)


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python validate_sft.py <path-to-jsonl>")
        sys.exit(1)
    validate_sft(sys.argv[1])