Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Build and deploy AI applications on Azure AI Foundry using Microsoft's model catalog and AI services
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
finetuning/scripts/validate/validate_sft.py
#!/usr/bin/env python3
"""Validate SFT (Supervised Fine-Tuning) JSONL files for Azure AI Foundry.

Adapted from foundry-ft agent with additional checks from our platform gotchas:
- Token length warnings (4096 limit varies by model)
- System prompt consistency check
"""
import json
import sys


try:
    sys.stdout.reconfigure(encoding="utf-8")
    sys.stderr.reconfigure(encoding="utf-8")
except (AttributeError, OSError):
    pass  # Stream not reconfigurable (older Python or non-tty); default encoding is fine

VALID_ROLES = {"system", "user", "assistant", "tool"}

# Default per-record token budget; the real limit depends on the target model.
DEFAULT_MAX_TOKENS = 4096
# How many findings of each kind the report prints before summarising the rest.
MAX_ERRORS_SHOWN = 20
MAX_WARNINGS_SHOWN = 10


def estimate_tokens(text: str) -> int:
    """Rough token estimate: ~4 chars per token for English text.

    Never returns less than 1, even for an empty string.
    """
    return max(1, len(text) // 4)


def _check_messages(messages, line_num, errors, warnings, system_prompts):
    """Validate the messages of one record and return their concatenated text.

    Findings are appended to ``errors`` / ``warnings``; the first 100 characters
    of every system message's content are added to ``system_prompts``.
    """
    expected_roles = ", ".join(sorted(VALID_ROLES))  # sorted => stable message
    roles_found = set()
    text_parts = []

    for i, msg in enumerate(messages):
        where = f"Line {line_num}, message {i}"

        # A bare string/number here would otherwise crash (or substring-match)
        # on the `in` checks below.
        if not isinstance(msg, dict):
            errors.append(f"{where}: Message must be a JSON object")
            continue

        role = msg.get("role")
        if "role" not in msg:
            errors.append(f"{where}: Missing 'role'")
        elif not isinstance(role, str) or role not in VALID_ROLES:
            # isinstance guard: an unhashable role (list/dict) cannot be
            # looked up in a set.
            errors.append(f"{where}: Invalid role '{role}' (expected: {expected_roles})")
        else:
            roles_found.add(role)

        content = msg.get("content")
        if "content" not in msg and "tool_calls" not in msg:
            errors.append(f"{where}: Missing 'content' (and no 'tool_calls')")
        elif content is not None:
            # Non-string content is stringified, so its size is only approximate.
            text = str(content)
            if not text.strip():
                warnings.append(f"{where}: Empty content string")
            text_parts.append(text)
            # Only record a system prompt when this very message carries text.
            if role == "system":
                system_prompts.add(text.strip()[:100])

    if "user" not in roles_found:
        errors.append(f"Line {line_num}: No 'user' message found")
    if "assistant" not in roles_found:
        errors.append(f"Line {line_num}: No 'assistant' message found")

    return "".join(text_parts)


def validate_sft(filepath: str, max_tokens: int = DEFAULT_MAX_TOKENS) -> None:
    """Validate a chat-format SFT JSONL file and print a report to stdout.

    Each non-blank line must be a JSON object with a non-empty ``messages``
    array; every message needs a valid ``role`` and either ``content`` or
    ``tool_calls``; every record needs at least one user and one assistant
    message.

    Args:
        filepath: Path to the JSONL file (UTF-8, with or without a BOM).
        max_tokens: Estimated-token threshold above which a record gets a
            warning. Defaults to 4096.

    Raises:
        SystemExit: With code 1 when at least one error was found.
        OSError: If the file cannot be opened.
    """
    errors = []
    warnings = []
    total = 0
    token_counts = []
    system_prompts = set()

    # json.loads rejects a leading BOM, and Azure's fine-tuning data guidance
    # asks for UTF-8 *with* a BOM; "utf-8-sig" accepts both variants.
    with open(filepath, "r", encoding="utf-8-sig") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            total += 1

            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f"Line {line_num}: Invalid JSON — {e}")
                continue

            if not isinstance(record, dict):
                errors.append(f"Line {line_num}: Record must be a JSON object")
                continue

            if "messages" not in record:
                errors.append(f"Line {line_num}: Missing 'messages' field")
                continue

            messages = record["messages"]
            if not isinstance(messages, list) or not messages:
                errors.append(f"Line {line_num}: 'messages' must be a non-empty array")
                continue

            total_text = _check_messages(messages, line_num, errors, warnings, system_prompts)

            tokens = estimate_tokens(total_text)
            token_counts.append(tokens)
            if tokens > max_tokens:
                warnings.append(
                    f"Line {line_num}: ~{tokens} tokens (exceeds {max_tokens} limit for most models)"
                )

    if total == 0:
        # An empty file must not be reported as valid training data.
        errors.append("File contains no records")

    # Must be added before the report so the printed warning count includes it.
    if len(system_prompts) > 1:
        warnings.append(
            f"Found {len(system_prompts)} different system prompts — ensure this is intentional"
        )

    # Report
    print(f"\n{'='*60}")
    print(f"SFT Validation Report: {filepath}")
    print(f"{'='*60}")
    print(f"Total records: {total}")
    print(f"Errors: {len(errors)}")
    print(f"Warnings: {len(warnings)}")

    if token_counts:
        avg_tok = sum(token_counts) / len(token_counts)
        print("\nToken stats (approx):")
        print(f" Avg: {avg_tok:.0f} Min: {min(token_counts)} Max: {max(token_counts)}")
        print(f" Total: {sum(token_counts):,}")

    if system_prompts:
        print(f"\nSystem prompts: {len(system_prompts)} unique")

    if errors:
        print("\n❌ ERRORS (must fix):")
        for e in errors[:MAX_ERRORS_SHOWN]:
            print(f" • {e}")
        if len(errors) > MAX_ERRORS_SHOWN:
            print(f" ... and {len(errors) - MAX_ERRORS_SHOWN} more errors")

    if warnings:
        print("\n⚠️ WARNINGS:")
        for w in warnings[:MAX_WARNINGS_SHOWN]:
            print(f" • {w}")
        if len(warnings) > MAX_WARNINGS_SHOWN:
            print(f" ... and {len(warnings) - MAX_WARNINGS_SHOWN} more warnings")

    if errors:
        print(f"\n❌ Fix {len(errors)} error(s) before submitting.")
        sys.exit(1)
    print("\n✅ Data is valid for SFT fine-tuning!")


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python validate_sft.py <path-to-jsonl>")
        sys.exit(1)
    try:
        validate_sft(sys.argv[1])
    except OSError as e:
        # Missing/unreadable path: give a one-line message instead of a traceback.
        print(f"Cannot read {sys.argv[1]}: {e}", file=sys.stderr)
        sys.exit(1)